{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Test Steel model (modified from Qwen1.5 / Qwen2 architecture): build and FLOPs profile"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-05-18 14:53:48,068] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
      "\u001b[93m [WARNING] \u001b[0m async_io requires the dev libaio .so object and headers but these were not found.\n",
      "\u001b[93m [WARNING] \u001b[0m async_io: please install the libaio-dev package with apt\n",
      "\u001b[93m [WARNING] \u001b[0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.\n",
      "\u001b[93m [WARNING] \u001b[0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/root/miniconda3/compiler_compat/ld: cannot find -laio: No such file or directory\n",
      "collect2: error: ld returned 1 exit status\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[93m [WARNING] \u001b[0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.3\n",
      "\u001b[93m [WARNING] \u001b[0m using untested triton version (2.3.0), only 1.0.0 is known to be compatible\n",
      "/home/calfa100/gqs/Steel-LLM/pretrain_modify_from_TinyLlama/model\n",
      "zhanshijin: surrport flash_attn_2\n",
      "zhanshijin: if flash attn surrport window:True\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "mlp_class: SteelSENet\n",
      "FFN: SteelSoftMoeV3\n",
      "zhanshijin: now use _attn_implementation is sdpa, you can choose from dict_keys(['eager', 'flash_attention_2', 'sdpa'])\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "zhanshijin: use torch rmsnorm\n",
      "模型在显存中的占用大小： 4765387776 字节\n",
      "参数名称: model.embed_tokens.weight，数据类型: torch.float32\n"
     ]
    }
   ],
   "source": [
    "# Build the Steel model from the local config and report its memory footprint.\n",
    "# Original author note: add consistency for norm and RoPE.\n",
    "import os\n",
    "import sys\n",
    "import torch\n",
    "import numpy  as np\n",
    "from deepspeed.profiling.flops_profiler import get_model_profile  # type: ignore\n",
    "current_dir = os.getcwd()\n",
    "print(current_dir)\n",
    "# Make the model package directory importable. The membership check keeps the\n",
    "# cell idempotent: re-running it no longer appends duplicate sys.path entries.\n",
    "steel_pkg_dir = os.path.join(current_dir, \"steel_modify_from_qwen_1_5\")\n",
    "if steel_pkg_dir not in sys.path:\n",
    "    sys.path.append(steel_pkg_dir)\n",
    "from transformers import AutoConfig\n",
    "from steel_modify_from_qwen_1_5.modeling_steel import SteelForCausalLM\n",
    "config = AutoConfig.from_pretrained(\"./steel_modify_from_qwen_1_5\",trust_remote_code=True)\n",
    "# Optional config overrides, kept for experimentation (uncomment to apply):\n",
    "# config.use_cuda_rmsnorm = True\n",
    "# config.mlp_type = \"senet\"\n",
    "# config.FFN_type = \"softmoe_v3\"\n",
    "# config.intermediate_size = 1792\n",
    "# config.hidden_size = 1792\n",
    "# config.num_attention_heads = 32\n",
    "# config.num_hidden_layers = 18\n",
    "# config.mlp_div_ratio = 4\n",
    "# Available attention implementations: ['eager', 'flash_attention_2', 'sdpa']\n",
    "# config._attn_implementation = \"flash_attention_2\"\n",
    "model = SteelForCausalLM(config)\n",
    "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n",
    "model = model.to(device)\n",
    "if device.type == \"cuda\":\n",
    "    # CUDA allocator statistics only describe the model when it lives on a CUDA device.\n",
    "    memory_allocated = torch.cuda.memory_allocated(device)\n",
    "    print(\"模型在显存中的占用大小：\", memory_allocated, \"字节\")\n",
    "else:\n",
    "    # CPU fallback: the CUDA allocator figure would be misleading here, so report\n",
    "    # the total parameter storage size instead.\n",
    "    memory_allocated = sum(p.numel() * p.element_size() for p in model.parameters())\n",
    "    print(\"CUDA unavailable; parameter storage size on CPU:\", memory_allocated, \"bytes\")\n",
    "# Quick dtype check: print the name and dtype of the first parameter only.\n",
    "for name, param in model.named_parameters():\n",
    "    print(f\"参数名称: {name}，数据类型: {param.dtype}\")\n",
    "    break\n",
    "# print(model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Qwen2ForCausalLM(\n",
      "  (model): Qwen2Model(\n",
      "    (embed_tokens): Embedding(151936, 1792)\n",
      "    (layers): ModuleList(\n",
      "      (0-17): 18 x Qwen2DecoderLayer(\n",
      "        (self_attn): Qwen2SdpaAttention(\n",
      "          (q_proj): Linear(in_features=1792, out_features=1792, bias=True)\n",
      "          (k_proj): Linear(in_features=1792, out_features=1792, bias=True)\n",
      "          (v_proj): Linear(in_features=1792, out_features=1792, bias=True)\n",
      "          (o_proj): Linear(in_features=1792, out_features=1792, bias=False)\n",
      "          (rotary_emb): Qwen2RotaryEmbedding()\n",
      "        )\n",
      "        (mlp): SteelSoftMoEV3(\n",
      "          (experts): ModuleList(\n",
      "            (0-5): 6 x SteelSENet(\n",
      "              (gate_up_proj): Linear(in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU()\n",
      "              (act_fn2): SiLU()\n",
      "            )\n",
      "          )\n",
      "        )\n",
      "        (input_layernorm): Qwen2RMSNorm()\n",
      "        (post_attention_layernorm): Qwen2RMSNorm()\n",
      "      )\n",
      "    )\n",
      "    (norm): Qwen2RMSNorm()\n",
      "  )\n",
      "  (lm_head): Linear(in_features=1792, out_features=151936, bias=False)\n",
      ")\n"
     ]
    }
   ],
   "source": [
    "# Print the module tree of `model`, which is built in the first code cell.\n",
    "# NOTE(review): the saved output of this cell lists Qwen2* class names, while the\n",
    "# profiler output further down lists Steel* classes; the saved output looks stale.\n",
    "# Re-run after Restart Kernel to refresh it.\n",
    "print(model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-05-18 14:54:08,367] [INFO] [profiler.py:1218:get_model_profile] Flops profiler warming-up...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-05-18 14:54:09,958] [INFO] [profiler.py:80:start_profile] Flops profiler started\n",
      "\n",
      "-------------------------- DeepSpeed Flops Profiler --------------------------\n",
      "Profile Summary at step 1:\n",
      "Notations:\n",
      "data parallel size (dp_size), model parallel size(mp_size),\n",
      "number of parameters (params), number of multiply-accumulate operations(MACs),\n",
      "number of floating-point operations (flops), floating-point operations per second (FLOPS),\n",
      "fwd latency (forward propagation latency), bwd latency (backward propagation latency),\n",
      "step (weights update latency), iter latency (sum of fwd, bwd and step latency)\n",
      "\n",
      "params per GPU:                                                         1.12 B  \n",
      "params of model = params per GPU * mp_size:                             0       \n",
      "fwd MACs per GPU:                                                       133.47 GMACs\n",
      "fwd flops per GPU:                                                      267.23 G\n",
      "fwd flops of model = fwd flops per GPU * mp_size:                       267.23 G\n",
      "fwd latency:                                                            129.84 ms\n",
      "fwd FLOPS per GPU = fwd flops per GPU / fwd latency:                    2.06 TFLOPS\n",
      "\n",
      "----------------------------- Aggregated Profile per GPU -----------------------------\n",
      "Top 1 modules in terms of params, MACs or fwd latency at different model depths:\n",
      "depth 0:\n",
      "    params      - {'SteelForCausalLM': '1.12 B'}\n",
      "    MACs        - {'SteelForCausalLM': '133.47 GMACs'}\n",
      "    fwd latency - {'SteelForCausalLM': '129.84 ms'}\n",
      "depth 1:\n",
      "    params      - {'SteelModel': '850.65 M'}\n",
      "    MACs        - {'Linear': '69.7 GMACs'}\n",
      "    fwd latency - {'SteelModel': '128.42 ms'}\n",
      "depth 2:\n",
      "    params      - {'ModuleList': '578.38 M'}\n",
      "    MACs        - {'ModuleList': '63.76 GMACs'}\n",
      "    fwd latency - {'ModuleList': '127.08 ms'}\n",
      "depth 3:\n",
      "    params      - {'SteelDecoderLayer': '578.38 M'}\n",
      "    MACs        - {'SteelDecoderLayer': '63.76 GMACs'}\n",
      "    fwd latency - {'SteelDecoderLayer': '127.08 ms'}\n",
      "depth 4:\n",
      "    params      - {'SteelSoftMoEV3': '347.01 M'}\n",
      "    MACs        - {'SteelSdpaAttention': '63.42 GMACs'}\n",
      "    fwd latency - {'SteelSoftMoEV3': '88.94 ms'}\n",
      "depth 5:\n",
      "    params      - {'ModuleList': '346.82 M'}\n",
      "    MACs        - {'Linear': '59.19 GMACs'}\n",
      "    fwd latency - {'ModuleList': '60.94 ms'}\n",
      "depth 6:\n",
      "    params      - {'SteelSENet': '346.82 M'}\n",
      "    MACs        - {'SteelSENet': '346.82 MMACs'}\n",
      "    fwd latency - {'SteelSENet': '60.94 ms'}\n",
      "\n",
      "------------------------------ Detailed Profile per GPU ------------------------------\n",
      "Each module profile is listed after its name in the following order: \n",
      "params, percentage of total params, MACs, percentage of total MACs, fwd latency, percentage of total fwd latency, fwd FLOPS\n",
      "\n",
      "Note: 1. A module can have torch.nn.module or torch.nn.functional to compute logits (e.g. CrossEntropyLoss). They are not counted as submodules, thus not to be printed out. However they make up the difference between a parent's MACs (or latency) and the sum of its submodules'.\n",
      "2. Number of floating-point operations is a theoretical estimation, thus FLOPS computed using that could be larger than the maximum system throughput.\n",
      "3. The fwd latency listed in the top module's profile is directly captured at the module forward function in PyTorch, thus it's less than the fwd latency shown above which is captured in DeepSpeed.\n",
      "\n",
      "SteelForCausalLM(\n",
      "  1.12 B = 100% Params, 133.47 GMACs = 100% MACs, 129.84 ms = 100% latency, 2.06 TFLOPS\n",
      "  (model): SteelModel(\n",
      "    850.65 M = 75.75% Params, 63.76 GMACs = 47.78% MACs, 128.42 ms = 98.9% latency, 995.39 GFLOPS\n",
      "    (embed_tokens): Embedding(272.27 M = 24.25% Params, 0 MACs = 0% MACs, 153.78 us = 0.12% latency, 0 FLOPS, 151936, 1792)\n",
      "    (layers): ModuleList(\n",
      "      (0): SteelDecoderLayer(\n",
      "        32.13 M = 2.86% Params, 3.54 GMACs = 2.65% MACs, 8.48 ms = 6.53% latency, 837.15 GFLOPS\n",
      "        (self_attn): SteelSdpaAttention(\n",
      "          12.85 M = 1.14% Params, 3.52 GMACs = 2.64% MACs, 2.02 ms = 1.56% latency, 3.48 TFLOPS\n",
      "          (q_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 662.8 us = 0.51% latency, 2.48 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (k_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 83.21 us = 0.06% latency, 19.76 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (v_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 284.19 us = 0.22% latency, 5.79 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (o_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 102.04 us = 0.08% latency, 16.11 TFLOPS, in_features=1792, out_features=1792, bias=False)\n",
      "          (rotary_emb): SteelRotaryEmbedding(0 = 0% Params, 0 MACs = 0% MACs, 57.22 us = 0.04% latency, 0 FLOPS)\n",
      "        )\n",
      "        (mlp): SteelSoftMoEV3(\n",
      "          19.28 M = 1.72% Params, 19.27 MMACs = 0.01% MACs, 5.53 ms = 4.26% latency, 9.96 GFLOPS\n",
      "          (experts): ModuleList(\n",
      "            (0): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 643.97 us = 0.5% latency, 9.98 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 109.67 us = 0.08% latency, 14.64 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 86.78 us = 0.07% latency, 18.5 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 67 us = 0.05% latency, 23.97 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.94 us = 0.05% latency, 25.51 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 69.38 us = 0.05% latency, 6.46 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.01 us = 0.03% latency, 43.7 MFLOPS)\n",
      "            )\n",
      "            (1): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 578.17 us = 0.45% latency, 11.11 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 87.26 us = 0.07% latency, 18.4 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.25 us = 0.06% latency, 19.52 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.18 us = 0.05% latency, 25.41 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 64.13 us = 0.05% latency, 25.04 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 43.63 us = 0.03% latency, 10.27 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 46.73 us = 0.04% latency, 38.35 MFLOPS)\n",
      "            )\n",
      "            (2): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 572.2 us = 0.44% latency, 11.23 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 84.88 us = 0.07% latency, 18.92 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 81.3 us = 0.06% latency, 19.75 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 65.09 us = 0.05% latency, 24.67 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 65.09 us = 0.05% latency, 24.67 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 42.68 us = 0.03% latency, 10.5 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 46.25 us = 0.04% latency, 38.74 MFLOPS)\n",
      "            )\n",
      "            (3): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 559.09 us = 0.43% latency, 11.49 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 86.07 us = 0.07% latency, 18.66 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.02 us = 0.06% latency, 19.58 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.94 us = 0.05% latency, 25.51 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.8 us = 0.05% latency, 26.41 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.72 us = 0.03% latency, 10.74 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.82 us = 0.03% latency, 45.01 MFLOPS)\n",
      "            )\n",
      "            (4): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 557.66 us = 0.43% latency, 11.52 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 85.83 us = 0.07% latency, 18.71 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 81.3 us = 0.06% latency, 19.75 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.9 us = 0.05% latency, 25.13 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.32 us = 0.05% latency, 26.62 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.72 us = 0.03% latency, 10.74 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.58 us = 0.03% latency, 45.28 MFLOPS)\n",
      "            )\n",
      "            (5): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 577.21 us = 0.44% latency, 11.13 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 85.59 us = 0.07% latency, 18.76 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.92 us = 0.06% latency, 19.13 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 65.09 us = 0.05% latency, 24.67 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.47 us = 0.05% latency, 25.7 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 51.98 us = 0.04% latency, 8.62 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.01 us = 0.03% latency, 43.7 MFLOPS)\n",
      "            )\n",
      "          )\n",
      "        )\n",
      "        (input_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 329.49 us = 0.25% latency, 0 FLOPS)\n",
      "        (post_attention_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 147.34 us = 0.11% latency, 0 FLOPS)\n",
      "      )\n",
      "      (1): SteelDecoderLayer(\n",
      "        32.13 M = 2.86% Params, 3.54 GMACs = 2.65% MACs, 7.25 ms = 5.59% latency, 979.22 GFLOPS\n",
      "        (self_attn): SteelSdpaAttention(\n",
      "          12.85 M = 1.14% Params, 3.52 GMACs = 2.64% MACs, 1.54 ms = 1.18% latency, 4.58 TFLOPS\n",
      "          (q_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 317.57 us = 0.24% latency, 5.18 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (k_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 82.25 us = 0.06% latency, 19.99 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (v_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 298.98 us = 0.23% latency, 5.5 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (o_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 78.92 us = 0.06% latency, 20.83 TFLOPS, in_features=1792, out_features=1792, bias=False)\n",
      "          (rotary_emb): SteelRotaryEmbedding(0 = 0% Params, 0 MACs = 0% MACs, 42.68 us = 0.03% latency, 0 FLOPS)\n",
      "        )\n",
      "        (mlp): SteelSoftMoEV3(\n",
      "          19.28 M = 1.72% Params, 19.27 MMACs = 0.01% MACs, 4.98 ms = 3.83% latency, 11.07 GFLOPS\n",
      "          (experts): ModuleList(\n",
      "            (0): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 586.51 us = 0.45% latency, 10.95 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 93.7 us = 0.07% latency, 17.14 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.49 us = 0.06% latency, 19.46 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 65.57 us = 0.05% latency, 24.49 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.99 us = 0.05% latency, 25.9 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 44.35 us = 0.03% latency, 10.1 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.58 us = 0.03% latency, 45.28 MFLOPS)\n",
      "            )\n",
      "            (1): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 566.72 us = 0.44% latency, 11.34 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 87.26 us = 0.07% latency, 18.4 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.73 us = 0.06% latency, 19.41 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 64.61 us = 0.05% latency, 24.85 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.8 us = 0.05% latency, 26.41 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 42.44 us = 0.03% latency, 10.56 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.29 us = 0.03% latency, 44.47 MFLOPS)\n",
      "            )\n",
      "            (2): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 558.38 us = 0.43% latency, 11.51 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.68 us = 0.06% latency, 19.19 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 81.06 us = 0.06% latency, 19.81 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.23 us = 0.05% latency, 25.8 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.23 us = 0.05% latency, 25.8 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 42.44 us = 0.03% latency, 10.56 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.34 us = 0.03% latency, 45.55 MFLOPS)\n",
      "            )\n",
      "            (3): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 557.66 us = 0.43% latency, 11.52 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 85.59 us = 0.07% latency, 18.76 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.21 us = 0.06% latency, 19.3 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.94 us = 0.05% latency, 25.51 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.32 us = 0.05% latency, 26.62 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.48 us = 0.03% latency, 10.8 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.34 us = 0.03% latency, 45.55 MFLOPS)\n",
      "            )\n",
      "            (4): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 547.65 us = 0.42% latency, 11.73 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.45 us = 0.06% latency, 19.24 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 80.59 us = 0.06% latency, 19.92 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.18 us = 0.05% latency, 25.41 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 59.6 us = 0.05% latency, 26.94 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.82 us = 0.03% latency, 11.25 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.62 us = 0.03% latency, 46.4 MFLOPS)\n",
      "            )\n",
      "            (5): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 565.77 us = 0.44% latency, 11.36 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 87.98 us = 0.07% latency, 18.25 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 84.88 us = 0.07% latency, 18.92 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 64.37 us = 0.05% latency, 24.94 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 59.84 us = 0.05% latency, 26.83 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 44.58 us = 0.03% latency, 10.05 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.1 us = 0.03% latency, 45.83 MFLOPS)\n",
      "            )\n",
      "          )\n",
      "        )\n",
      "        (input_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 165.22 us = 0.13% latency, 0 FLOPS)\n",
      "        (post_attention_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 146.63 us = 0.11% latency, 0 FLOPS)\n",
      "      )\n",
      "      (2): SteelDecoderLayer(\n",
      "        32.13 M = 2.86% Params, 3.54 GMACs = 2.65% MACs, 7.09 ms = 5.46% latency, 1 TFLOPS\n",
      "        (self_attn): SteelSdpaAttention(\n",
      "          12.85 M = 1.14% Params, 3.52 GMACs = 2.64% MACs, 1.51 ms = 1.16% latency, 4.68 TFLOPS\n",
      "          (q_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 91.08 us = 0.07% latency, 18.05 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (k_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 295.16 us = 0.23% latency, 5.57 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (v_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 80.35 us = 0.06% latency, 20.46 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (o_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 78.44 us = 0.06% latency, 20.96 TFLOPS, in_features=1792, out_features=1792, bias=False)\n",
      "          (rotary_emb): SteelRotaryEmbedding(0 = 0% Params, 0 MACs = 0% MACs, 41.25 us = 0.03% latency, 0 FLOPS)\n",
      "        )\n",
      "        (mlp): SteelSoftMoEV3(\n",
      "          19.28 M = 1.72% Params, 19.27 MMACs = 0.01% MACs, 5.09 ms = 3.92% latency, 10.81 GFLOPS\n",
      "          (experts): ModuleList(\n",
      "            (0): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 587.7 us = 0.45% latency, 10.93 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 94.89 us = 0.07% latency, 16.92 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.92 us = 0.06% latency, 19.13 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 64.13 us = 0.05% latency, 25.04 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.99 us = 0.05% latency, 25.9 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 45.54 us = 0.04% latency, 9.84 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.48 us = 0.03% latency, 43.2 MFLOPS)\n",
      "            )\n",
      "            (1): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 581.03 us = 0.45% latency, 11.06 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 85.83 us = 0.07% latency, 18.71 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 92.51 us = 0.07% latency, 17.36 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 66.76 us = 0.05% latency, 24.05 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.47 us = 0.05% latency, 25.7 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.48 us = 0.03% latency, 10.8 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 42.92 us = 0.03% latency, 41.76 MFLOPS)\n",
      "            )\n",
      "            (2): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 665.66 us = 0.51% latency, 9.65 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 92.98 us = 0.07% latency, 17.27 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.73 us = 0.06% latency, 19.41 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 140.19 us = 0.11% latency, 11.45 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 66.76 us = 0.05% latency, 24.05 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 43.15 us = 0.03% latency, 10.38 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 44.35 us = 0.03% latency, 40.41 MFLOPS)\n",
      "            )\n",
      "            (3): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 575.07 us = 0.44% latency, 11.17 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 86.78 us = 0.07% latency, 18.5 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 89.41 us = 0.07% latency, 17.96 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 64.13 us = 0.05% latency, 25.04 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.08 us = 0.05% latency, 26.72 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 42.68 us = 0.03% latency, 10.5 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.82 us = 0.03% latency, 45.01 MFLOPS)\n",
      "            )\n",
      "            (4): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 558.61 us = 0.43% latency, 11.5 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 84.4 us = 0.07% latency, 19.02 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.21 us = 0.06% latency, 19.3 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.42 us = 0.05% latency, 25.32 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.04 us = 0.05% latency, 26.31 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 42.68 us = 0.03% latency, 10.5 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.82 us = 0.03% latency, 45.01 MFLOPS)\n",
      "            )\n",
      "            (5): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 555.04 us = 0.43% latency, 11.58 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.92 us = 0.06% latency, 19.13 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 80.11 us = 0.06% latency, 20.04 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 64.61 us = 0.05% latency, 24.85 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.51 us = 0.05% latency, 26.1 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.77 us = 0.03% latency, 10.99 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.01 us = 0.03% latency, 43.7 MFLOPS)\n",
      "            )\n",
      "          )\n",
      "        )\n",
      "        (input_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 161.65 us = 0.12% latency, 0 FLOPS)\n",
      "        (post_attention_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 145.91 us = 0.11% latency, 0 FLOPS)\n",
      "      )\n",
      "      (3): SteelDecoderLayer(\n",
      "        32.13 M = 2.86% Params, 3.54 GMACs = 2.65% MACs, 7.18 ms = 5.53% latency, 989.6 GFLOPS\n",
      "        (self_attn): SteelSdpaAttention(\n",
      "          12.85 M = 1.14% Params, 3.52 GMACs = 2.64% MACs, 1.5 ms = 1.15% latency, 4.7 TFLOPS\n",
      "          (q_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 307.8 us = 0.24% latency, 5.34 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (k_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 91.55 us = 0.07% latency, 17.96 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (v_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 287.06 us = 0.22% latency, 5.73 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (o_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 77.49 us = 0.06% latency, 21.22 TFLOPS, in_features=1792, out_features=1792, bias=False)\n",
      "          (rotary_emb): SteelRotaryEmbedding(0 = 0% Params, 0 MACs = 0% MACs, 41.48 us = 0.03% latency, 0 FLOPS)\n",
      "        )\n",
      "        (mlp): SteelSoftMoEV3(\n",
      "          19.28 M = 1.72% Params, 19.27 MMACs = 0.01% MACs, 4.97 ms = 3.82% latency, 11.09 GFLOPS\n",
      "          (experts): ModuleList(\n",
      "            (0): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 589.85 us = 0.45% latency, 10.89 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 93.7 us = 0.07% latency, 17.14 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.92 us = 0.06% latency, 19.13 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 64.13 us = 0.05% latency, 25.04 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.23 us = 0.05% latency, 25.8 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 47.45 us = 0.04% latency, 9.44 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.29 us = 0.03% latency, 44.47 MFLOPS)\n",
      "            )\n",
      "            (1): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 573.16 us = 0.44% latency, 11.21 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 86.31 us = 0.07% latency, 18.6 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.21 us = 0.06% latency, 19.3 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 71.05 us = 0.05% latency, 22.6 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.75 us = 0.05% latency, 26 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.25 us = 0.03% latency, 10.86 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.96 us = 0.03% latency, 42.71 MFLOPS)\n",
      "            )\n",
      "            (2): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 574.35 us = 0.44% latency, 11.19 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 85.59 us = 0.07% latency, 18.76 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 90.36 us = 0.07% latency, 17.77 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 65.33 us = 0.05% latency, 24.58 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.75 us = 0.05% latency, 26 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.25 us = 0.03% latency, 10.86 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.53 us = 0.03% latency, 44.21 MFLOPS)\n",
      "            )\n",
      "            (3): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 552.42 us = 0.43% latency, 11.63 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 85.12 us = 0.07% latency, 18.86 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 81.06 us = 0.06% latency, 19.81 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.18 us = 0.05% latency, 25.41 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 59.84 us = 0.05% latency, 26.83 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.77 us = 0.03% latency, 10.99 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.62 us = 0.03% latency, 46.4 MFLOPS)\n",
      "            )\n",
      "            (4): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 553.37 us = 0.43% latency, 11.61 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.97 us = 0.06% latency, 19.35 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 81.06 us = 0.06% latency, 19.81 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.42 us = 0.05% latency, 25.32 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.04 us = 0.05% latency, 26.31 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 42.68 us = 0.03% latency, 10.5 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.39 us = 0.03% latency, 46.68 MFLOPS)\n",
      "            )\n",
      "            (5): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 569.58 us = 0.44% latency, 11.28 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.68 us = 0.06% latency, 19.19 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 87.74 us = 0.07% latency, 18.3 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 66.52 us = 0.05% latency, 24.14 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.75 us = 0.05% latency, 26 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.01 us = 0.03% latency, 10.92 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.77 us = 0.03% latency, 43.95 MFLOPS)\n",
      "            )\n",
      "          )\n",
      "        )\n",
      "        (input_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 152.59 us = 0.12% latency, 0 FLOPS)\n",
      "        (post_attention_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 147.58 us = 0.11% latency, 0 FLOPS)\n",
      "      )\n",
      "      (4): SteelDecoderLayer(\n",
      "        32.13 M = 2.86% Params, 3.54 GMACs = 2.65% MACs, 6.94 ms = 5.34% latency, 1.02 TFLOPS\n",
      "        (self_attn): SteelSdpaAttention(\n",
      "          12.85 M = 1.14% Params, 3.52 GMACs = 2.64% MACs, 1.49 ms = 1.15% latency, 4.72 TFLOPS\n",
      "          (q_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 90.6 us = 0.07% latency, 18.15 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (k_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 289.2 us = 0.22% latency, 5.69 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (v_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 78.68 us = 0.06% latency, 20.9 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (o_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 78.92 us = 0.06% latency, 20.83 TFLOPS, in_features=1792, out_features=1792, bias=False)\n",
      "          (rotary_emb): SteelRotaryEmbedding(0 = 0% Params, 0 MACs = 0% MACs, 40.53 us = 0.03% latency, 0 FLOPS)\n",
      "        )\n",
      "        (mlp): SteelSoftMoEV3(\n",
      "          19.28 M = 1.72% Params, 19.27 MMACs = 0.01% MACs, 4.96 ms = 3.82% latency, 11.1 GFLOPS\n",
      "          (experts): ModuleList(\n",
      "            (0): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 588.89 us = 0.45% latency, 10.91 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 92.74 us = 0.07% latency, 17.31 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.92 us = 0.06% latency, 19.13 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 64.61 us = 0.05% latency, 24.85 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.47 us = 0.05% latency, 25.7 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 43.63 us = 0.03% latency, 10.27 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.05 us = 0.03% latency, 44.74 MFLOPS)\n",
      "            )\n",
      "            (1): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 561.71 us = 0.43% latency, 11.44 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 86.55 us = 0.07% latency, 18.55 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 81.3 us = 0.06% latency, 19.75 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.66 us = 0.05% latency, 25.22 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.56 us = 0.05% latency, 26.51 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.72 us = 0.03% latency, 10.74 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.01 us = 0.03% latency, 43.7 MFLOPS)\n",
      "            )\n",
      "            (2): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 564.58 us = 0.43% latency, 11.38 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 84.16 us = 0.06% latency, 19.08 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.68 us = 0.06% latency, 19.19 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.42 us = 0.05% latency, 25.32 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.75 us = 0.05% latency, 26 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 47.45 us = 0.04% latency, 9.44 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.58 us = 0.03% latency, 45.28 MFLOPS)\n",
      "            )\n",
      "            (3): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 551.94 us = 0.43% latency, 11.64 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.21 us = 0.06% latency, 19.3 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 79.63 us = 0.06% latency, 20.16 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.94 us = 0.05% latency, 25.51 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.94 us = 0.05% latency, 25.51 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.25 us = 0.03% latency, 10.86 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.1 us = 0.03% latency, 45.83 MFLOPS)\n",
      "            )\n",
      "            (4): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 565.77 us = 0.44% latency, 11.36 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 93.94 us = 0.07% latency, 17.09 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 81.54 us = 0.06% latency, 19.69 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.7 us = 0.05% latency, 25.61 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 59.37 us = 0.05% latency, 27.05 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 42.44 us = 0.03% latency, 10.56 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.82 us = 0.03% latency, 45.01 MFLOPS)\n",
      "            )\n",
      "            (5): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 560.05 us = 0.43% latency, 11.47 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 90.84 us = 0.07% latency, 17.68 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 80.59 us = 0.06% latency, 19.92 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.18 us = 0.05% latency, 25.41 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.75 us = 0.05% latency, 26 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.01 us = 0.03% latency, 10.92 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.86 us = 0.03% latency, 46.11 MFLOPS)\n",
      "            )\n",
      "          )\n",
      "        )\n",
      "        (input_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 156.64 us = 0.12% latency, 0 FLOPS)\n",
      "        (post_attention_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 146.15 us = 0.11% latency, 0 FLOPS)\n",
      "      )\n",
      "      (5): SteelDecoderLayer(\n",
      "        32.13 M = 2.86% Params, 3.54 GMACs = 2.65% MACs, 7.18 ms = 5.53% latency, 989.63 GFLOPS\n",
      "        (self_attn): SteelSdpaAttention(\n",
      "          12.85 M = 1.14% Params, 3.52 GMACs = 2.64% MACs, 1.49 ms = 1.15% latency, 4.72 TFLOPS\n",
      "          (q_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 315.9 us = 0.24% latency, 5.2 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (k_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 81.78 us = 0.06% latency, 20.11 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (v_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 282.76 us = 0.22% latency, 5.81 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (o_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 77.01 us = 0.06% latency, 21.35 TFLOPS, in_features=1792, out_features=1792, bias=False)\n",
      "          (rotary_emb): SteelRotaryEmbedding(0 = 0% Params, 0 MACs = 0% MACs, 43.39 us = 0.03% latency, 0 FLOPS)\n",
      "        )\n",
      "        (mlp): SteelSoftMoEV3(\n",
      "          19.28 M = 1.72% Params, 19.27 MMACs = 0.01% MACs, 4.98 ms = 3.83% latency, 11.07 GFLOPS\n",
      "          (experts): ModuleList(\n",
      "            (0): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 593.19 us = 0.46% latency, 10.83 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 94.89 us = 0.07% latency, 16.92 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 84.4 us = 0.07% latency, 19.02 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.9 us = 0.05% latency, 25.13 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.94 us = 0.05% latency, 25.51 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 46.01 us = 0.04% latency, 9.74 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.05 us = 0.03% latency, 44.74 MFLOPS)\n",
      "            )\n",
      "            (1): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 562.67 us = 0.43% latency, 11.42 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 85.83 us = 0.07% latency, 18.71 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.97 us = 0.06% latency, 19.35 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.42 us = 0.05% latency, 25.32 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.47 us = 0.05% latency, 25.7 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 42.2 us = 0.03% latency, 10.62 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.05 us = 0.03% latency, 44.74 MFLOPS)\n",
      "            )\n",
      "            (2): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 551.22 us = 0.42% latency, 11.66 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 84.4 us = 0.07% latency, 19.02 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 79.87 us = 0.06% latency, 20.1 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.18 us = 0.05% latency, 25.41 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 59.6 us = 0.05% latency, 26.94 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.48 us = 0.03% latency, 10.8 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.58 us = 0.03% latency, 45.28 MFLOPS)\n",
      "            )\n",
      "            (3): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 558.61 us = 0.43% latency, 11.5 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 84.16 us = 0.06% latency, 19.08 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 80.35 us = 0.06% latency, 19.98 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.94 us = 0.05% latency, 25.51 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 59.84 us = 0.05% latency, 26.83 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 47.68 us = 0.04% latency, 9.4 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.58 us = 0.03% latency, 45.28 MFLOPS)\n",
      "            )\n",
      "            (4): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 547.41 us = 0.42% latency, 11.74 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 81.54 us = 0.06% latency, 19.69 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 79.63 us = 0.06% latency, 20.16 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 64.37 us = 0.05% latency, 24.94 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.08 us = 0.05% latency, 26.72 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.58 us = 0.03% latency, 11.32 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.82 us = 0.03% latency, 45.01 MFLOPS)\n",
      "            )\n",
      "            (5): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 572.92 us = 0.44% latency, 11.21 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.21 us = 0.06% latency, 19.3 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 80.11 us = 0.06% latency, 20.04 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 68.9 us = 0.05% latency, 23.3 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.99 us = 0.05% latency, 25.9 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.72 us = 0.03% latency, 10.74 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.25 us = 0.03% latency, 43.45 MFLOPS)\n",
      "            )\n",
      "          )\n",
      "        )\n",
      "        (input_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 154.97 us = 0.12% latency, 0 FLOPS)\n",
      "        (post_attention_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 146.15 us = 0.11% latency, 0 FLOPS)\n",
      "      )\n",
      "      (6): SteelDecoderLayer(\n",
      "        32.13 M = 2.86% Params, 3.54 GMACs = 2.65% MACs, 6.89 ms = 5.3% latency, 1.03 TFLOPS\n",
      "        (self_attn): SteelSdpaAttention(\n",
      "          12.85 M = 1.14% Params, 3.52 GMACs = 2.64% MACs, 1.47 ms = 1.13% latency, 4.8 TFLOPS\n",
      "          (q_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 92.03 us = 0.07% latency, 17.87 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (k_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 301.12 us = 0.23% latency, 5.46 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (v_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 79.87 us = 0.06% latency, 20.59 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (o_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 77.01 us = 0.06% latency, 21.35 TFLOPS, in_features=1792, out_features=1792, bias=False)\n",
      "          (rotary_emb): SteelRotaryEmbedding(0 = 0% Params, 0 MACs = 0% MACs, 40.53 us = 0.03% latency, 0 FLOPS)\n",
      "        )\n",
      "        (mlp): SteelSoftMoEV3(\n",
      "          19.28 M = 1.72% Params, 19.27 MMACs = 0.01% MACs, 4.94 ms = 3.8% latency, 11.16 GFLOPS\n",
      "          (experts): ModuleList(\n",
      "            (0): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 597 us = 0.46% latency, 10.76 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 92.51 us = 0.07% latency, 17.36 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 81.3 us = 0.06% latency, 19.75 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 64.61 us = 0.05% latency, 24.85 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.7 us = 0.05% latency, 25.61 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 43.63 us = 0.03% latency, 10.27 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 43.15 us = 0.03% latency, 41.53 MFLOPS)\n",
      "            )\n",
      "            (1): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 570.06 us = 0.44% latency, 11.27 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 87.02 us = 0.07% latency, 18.45 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 87.74 us = 0.07% latency, 18.3 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 64.37 us = 0.05% latency, 24.94 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.04 us = 0.05% latency, 26.31 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.48 us = 0.03% latency, 10.8 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.58 us = 0.03% latency, 45.28 MFLOPS)\n",
      "            )\n",
      "            (2): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 566.01 us = 0.44% latency, 11.35 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 85.35 us = 0.07% latency, 18.81 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.21 us = 0.06% latency, 19.3 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 64.61 us = 0.05% latency, 24.85 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.47 us = 0.05% latency, 25.7 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.96 us = 0.03% latency, 10.68 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.82 us = 0.03% latency, 45.01 MFLOPS)\n",
      "            )\n",
      "            (3): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 551.7 us = 0.42% latency, 11.65 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.73 us = 0.06% latency, 19.41 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 80.59 us = 0.06% latency, 19.92 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.7 us = 0.05% latency, 25.61 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.08 us = 0.05% latency, 26.72 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.48 us = 0.03% latency, 10.8 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.82 us = 0.03% latency, 45.01 MFLOPS)\n",
      "            )\n",
      "            (4): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 563.62 us = 0.43% latency, 11.4 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.45 us = 0.06% latency, 19.24 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 84.64 us = 0.07% latency, 18.97 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 65.33 us = 0.05% latency, 24.58 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.27 us = 0.05% latency, 26.2 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.72 us = 0.03% latency, 10.74 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.58 us = 0.03% latency, 45.28 MFLOPS)\n",
      "            )\n",
      "            (5): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 552.18 us = 0.43% latency, 11.64 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 84.4 us = 0.07% latency, 19.02 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 81.78 us = 0.06% latency, 19.63 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.94 us = 0.05% latency, 25.51 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.08 us = 0.05% latency, 26.72 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.53 us = 0.03% latency, 11.05 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.34 us = 0.03% latency, 45.55 MFLOPS)\n",
      "            )\n",
      "          )\n",
      "        )\n",
      "        (input_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 158.07 us = 0.12% latency, 0 FLOPS)\n",
      "        (post_attention_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 148.06 us = 0.11% latency, 0 FLOPS)\n",
      "      )\n",
      "      (7): SteelDecoderLayer(\n",
      "        32.13 M = 2.86% Params, 3.54 GMACs = 2.65% MACs, 7.09 ms = 5.46% latency, 1 TFLOPS\n",
      "        (self_attn): SteelSdpaAttention(\n",
      "          12.85 M = 1.14% Params, 3.52 GMACs = 2.64% MACs, 1.48 ms = 1.14% latency, 4.78 TFLOPS\n",
      "          (q_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 309.94 us = 0.24% latency, 5.3 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (k_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 80.59 us = 0.06% latency, 20.4 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (v_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 282.29 us = 0.22% latency, 5.82 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (o_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 77.01 us = 0.06% latency, 21.35 TFLOPS, in_features=1792, out_features=1792, bias=False)\n",
      "          (rotary_emb): SteelRotaryEmbedding(0 = 0% Params, 0 MACs = 0% MACs, 41.48 us = 0.03% latency, 0 FLOPS)\n",
      "        )\n",
      "        (mlp): SteelSoftMoEV3(\n",
      "          19.28 M = 1.72% Params, 19.27 MMACs = 0.01% MACs, 4.91 ms = 3.78% latency, 11.21 GFLOPS\n",
      "          (experts): ModuleList(\n",
      "            (0): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 582.22 us = 0.45% latency, 11.03 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 92.27 us = 0.07% latency, 17.4 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.92 us = 0.06% latency, 19.13 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 64.37 us = 0.05% latency, 24.94 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.18 us = 0.05% latency, 25.41 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 44.11 us = 0.03% latency, 10.16 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.29 us = 0.03% latency, 44.47 MFLOPS)\n",
      "            )\n",
      "            (1): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 564.1 us = 0.43% latency, 11.39 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 85.35 us = 0.07% latency, 18.81 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.49 us = 0.06% latency, 19.46 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 64.85 us = 0.05% latency, 24.76 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.75 us = 0.05% latency, 26 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.48 us = 0.03% latency, 10.8 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.05 us = 0.03% latency, 44.74 MFLOPS)\n",
      "            )\n",
      "            (2): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 566.01 us = 0.44% latency, 11.35 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 97.99 us = 0.08% latency, 16.39 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.02 us = 0.06% latency, 19.58 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.7 us = 0.05% latency, 25.61 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.08 us = 0.05% latency, 26.72 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.25 us = 0.03% latency, 10.86 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.1 us = 0.03% latency, 45.83 MFLOPS)\n",
      "            )\n",
      "            (3): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 550.27 us = 0.42% latency, 11.68 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.97 us = 0.06% latency, 19.35 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 81.3 us = 0.06% latency, 19.75 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.94 us = 0.05% latency, 25.51 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.08 us = 0.05% latency, 26.72 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.01 us = 0.03% latency, 10.92 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.86 us = 0.03% latency, 46.11 MFLOPS)\n",
      "            )\n",
      "            (4): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 566.01 us = 0.44% latency, 11.35 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 84.16 us = 0.06% latency, 19.08 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 80.11 us = 0.06% latency, 20.04 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 64.85 us = 0.05% latency, 24.76 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 67.95 us = 0.05% latency, 23.63 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.25 us = 0.03% latency, 10.86 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.82 us = 0.03% latency, 45.01 MFLOPS)\n",
      "            )\n",
      "            (5): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 564.58 us = 0.43% latency, 11.38 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 84.64 us = 0.07% latency, 18.97 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.02 us = 0.06% latency, 19.58 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 68.19 us = 0.05% latency, 23.55 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.51 us = 0.05% latency, 26.1 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.96 us = 0.03% latency, 10.68 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.53 us = 0.03% latency, 44.21 MFLOPS)\n",
      "            )\n",
      "          )\n",
      "        )\n",
      "        (input_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 151.16 us = 0.12% latency, 0 FLOPS)\n",
      "        (post_attention_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 143.77 us = 0.11% latency, 0 FLOPS)\n",
      "      )\n",
      "      (8): SteelDecoderLayer(\n",
      "        32.13 M = 2.86% Params, 3.54 GMACs = 2.65% MACs, 6.89 ms = 5.31% latency, 1.03 TFLOPS\n",
      "        (self_attn): SteelSdpaAttention(\n",
      "          12.85 M = 1.14% Params, 3.52 GMACs = 2.64% MACs, 1.49 ms = 1.14% latency, 4.74 TFLOPS\n",
      "          (q_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 90.84 us = 0.07% latency, 18.1 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (k_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 293.02 us = 0.23% latency, 5.61 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (v_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 81.06 us = 0.06% latency, 20.28 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (o_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 76.77 us = 0.06% latency, 21.42 TFLOPS, in_features=1792, out_features=1792, bias=False)\n",
      "          (rotary_emb): SteelRotaryEmbedding(0 = 0% Params, 0 MACs = 0% MACs, 40.53 us = 0.03% latency, 0 FLOPS)\n",
      "        )\n",
      "        (mlp): SteelSoftMoEV3(\n",
      "          19.28 M = 1.72% Params, 19.27 MMACs = 0.01% MACs, 4.93 ms = 3.8% latency, 11.17 GFLOPS\n",
      "          (experts): ModuleList(\n",
      "            (0): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 586.99 us = 0.45% latency, 10.95 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 94.89 us = 0.07% latency, 16.92 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.49 us = 0.06% latency, 19.46 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.9 us = 0.05% latency, 25.13 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.23 us = 0.05% latency, 25.8 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 44.35 us = 0.03% latency, 10.1 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.72 us = 0.03% latency, 42.95 MFLOPS)\n",
      "            )\n",
      "            (1): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 562.67 us = 0.43% latency, 11.42 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 84.4 us = 0.07% latency, 19.02 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.25 us = 0.06% latency, 19.52 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.18 us = 0.05% latency, 25.41 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.08 us = 0.05% latency, 26.72 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.48 us = 0.03% latency, 10.8 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.29 us = 0.03% latency, 44.47 MFLOPS)\n",
      "            )\n",
      "            (2): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 561.95 us = 0.43% latency, 11.43 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.68 us = 0.06% latency, 19.19 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 79.87 us = 0.06% latency, 20.1 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.42 us = 0.05% latency, 25.32 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 64.85 us = 0.05% latency, 24.76 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.72 us = 0.03% latency, 10.74 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.72 us = 0.03% latency, 42.95 MFLOPS)\n",
      "            )\n",
      "            (3): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 579.6 us = 0.45% latency, 11.08 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 93.94 us = 0.07% latency, 17.09 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 80.35 us = 0.06% latency, 19.98 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.18 us = 0.05% latency, 25.41 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 71.53 us = 0.06% latency, 22.45 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 42.44 us = 0.03% latency, 10.56 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.34 us = 0.03% latency, 45.55 MFLOPS)\n",
      "            )\n",
      "            (4): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 552.18 us = 0.43% latency, 11.64 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 84.88 us = 0.07% latency, 18.92 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 81.06 us = 0.06% latency, 19.81 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.7 us = 0.05% latency, 25.61 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.08 us = 0.05% latency, 26.72 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.25 us = 0.03% latency, 10.86 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.58 us = 0.03% latency, 45.28 MFLOPS)\n",
      "            )\n",
      "            (5): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 545.98 us = 0.42% latency, 11.77 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.73 us = 0.06% latency, 19.41 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 79.63 us = 0.06% latency, 20.16 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.7 us = 0.05% latency, 25.61 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.8 us = 0.05% latency, 26.41 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.82 us = 0.03% latency, 11.25 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.34 us = 0.03% latency, 45.55 MFLOPS)\n",
      "            )\n",
      "          )\n",
      "        )\n",
      "        (input_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 155.21 us = 0.12% latency, 0 FLOPS)\n",
      "        (post_attention_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 144.48 us = 0.11% latency, 0 FLOPS)\n",
      "      )\n",
      "      (9): SteelDecoderLayer(\n",
      "        32.13 M = 2.86% Params, 3.54 GMACs = 2.65% MACs, 7.06 ms = 5.43% latency, 1.01 TFLOPS\n",
      "        (self_attn): SteelSdpaAttention(\n",
      "          12.85 M = 1.14% Params, 3.52 GMACs = 2.64% MACs, 1.46 ms = 1.12% latency, 4.84 TFLOPS\n",
      "          (q_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 297.31 us = 0.23% latency, 5.53 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (k_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 80.35 us = 0.06% latency, 20.46 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (v_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 286.34 us = 0.22% latency, 5.74 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (o_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 77.25 us = 0.06% latency, 21.28 TFLOPS, in_features=1792, out_features=1792, bias=False)\n",
      "          (rotary_emb): SteelRotaryEmbedding(0 = 0% Params, 0 MACs = 0% MACs, 41.96 us = 0.03% latency, 0 FLOPS)\n",
      "        )\n",
      "        (mlp): SteelSoftMoEV3(\n",
      "          19.28 M = 1.72% Params, 19.27 MMACs = 0.01% MACs, 4.92 ms = 3.79% latency, 11.19 GFLOPS\n",
      "          (experts): ModuleList(\n",
      "            (0): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 595.09 us = 0.46% latency, 10.8 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 95.84 us = 0.07% latency, 16.75 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 84.64 us = 0.07% latency, 18.97 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 66.04 us = 0.05% latency, 24.31 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.47 us = 0.05% latency, 25.7 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 45.78 us = 0.04% latency, 9.79 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.53 us = 0.03% latency, 44.21 MFLOPS)\n",
      "            )\n",
      "            (1): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 568.87 us = 0.44% latency, 11.29 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 85.12 us = 0.07% latency, 18.86 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.68 us = 0.06% latency, 19.19 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 64.37 us = 0.05% latency, 24.94 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.32 us = 0.05% latency, 26.62 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 46.97 us = 0.04% latency, 9.54 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.29 us = 0.03% latency, 44.47 MFLOPS)\n",
      "            )\n",
      "            (2): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 557.66 us = 0.43% latency, 11.52 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 84.88 us = 0.07% latency, 18.92 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 81.06 us = 0.06% latency, 19.81 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.7 us = 0.05% latency, 25.61 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.27 us = 0.05% latency, 26.2 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.72 us = 0.03% latency, 10.74 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.05 us = 0.03% latency, 44.74 MFLOPS)\n",
      "            )\n",
      "            (3): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 548.36 us = 0.42% latency, 11.72 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.97 us = 0.06% latency, 19.35 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 79.63 us = 0.06% latency, 20.16 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.18 us = 0.05% latency, 25.41 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.32 us = 0.05% latency, 26.62 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.29 us = 0.03% latency, 11.12 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.1 us = 0.03% latency, 45.83 MFLOPS)\n",
      "            )\n",
      "            (4): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 549.55 us = 0.42% latency, 11.69 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.45 us = 0.06% latency, 19.24 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 79.15 us = 0.06% latency, 20.28 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.18 us = 0.05% latency, 25.41 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 59.37 us = 0.05% latency, 27.05 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.53 us = 0.03% latency, 11.05 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.34 us = 0.03% latency, 45.55 MFLOPS)\n",
      "            )\n",
      "            (5): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 570.54 us = 0.44% latency, 11.26 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.21 us = 0.06% latency, 19.3 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 91.08 us = 0.07% latency, 17.63 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 65.09 us = 0.05% latency, 24.67 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.23 us = 0.05% latency, 25.8 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.53 us = 0.03% latency, 11.05 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.05 us = 0.03% latency, 44.74 MFLOPS)\n",
      "            )\n",
      "          )\n",
      "        )\n",
      "        (input_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 148.53 us = 0.11% latency, 0 FLOPS)\n",
      "        (post_attention_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 144.72 us = 0.11% latency, 0 FLOPS)\n",
      "      )\n",
      "      (10): SteelDecoderLayer(\n",
      "        32.13 M = 2.86% Params, 3.54 GMACs = 2.65% MACs, 6.75 ms = 5.2% latency, 1.05 TFLOPS\n",
      "        (self_attn): SteelSdpaAttention(\n",
      "          12.85 M = 1.14% Params, 3.52 GMACs = 2.64% MACs, 1.45 ms = 1.12% latency, 4.86 TFLOPS\n",
      "          (q_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 89.65 us = 0.07% latency, 18.34 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (k_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 291.35 us = 0.22% latency, 5.64 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (v_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 79.15 us = 0.06% latency, 20.77 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (o_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 77.49 us = 0.06% latency, 21.22 TFLOPS, in_features=1792, out_features=1792, bias=False)\n",
      "          (rotary_emb): SteelRotaryEmbedding(0 = 0% Params, 0 MACs = 0% MACs, 40.05 us = 0.03% latency, 0 FLOPS)\n",
      "        )\n",
      "        (mlp): SteelSoftMoEV3(\n",
      "          19.28 M = 1.72% Params, 19.27 MMACs = 0.01% MACs, 4.83 ms = 3.72% latency, 11.39 GFLOPS\n",
      "          (experts): ModuleList(\n",
      "            (0): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 586.51 us = 0.45% latency, 10.95 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 95.37 us = 0.07% latency, 16.84 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 81.06 us = 0.06% latency, 19.81 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.42 us = 0.05% latency, 25.32 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.94 us = 0.05% latency, 25.51 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.96 us = 0.03% latency, 10.68 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 51.02 us = 0.04% latency, 35.12 MFLOPS)\n",
      "            )\n",
      "            (1): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 552.89 us = 0.43% latency, 11.62 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 85.35 us = 0.07% latency, 18.81 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 80.82 us = 0.06% latency, 19.87 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.7 us = 0.05% latency, 25.61 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.32 us = 0.05% latency, 26.62 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.01 us = 0.03% latency, 10.92 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.1 us = 0.03% latency, 45.83 MFLOPS)\n",
      "            )\n",
      "            (2): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 562.19 us = 0.43% latency, 11.43 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.45 us = 0.06% latency, 19.24 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 89.88 us = 0.07% latency, 17.86 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 64.37 us = 0.05% latency, 24.94 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.32 us = 0.05% latency, 26.62 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.77 us = 0.03% latency, 10.99 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.1 us = 0.03% latency, 45.83 MFLOPS)\n",
      "            )\n",
      "            (3): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 542.16 us = 0.42% latency, 11.85 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.92 us = 0.06% latency, 19.13 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 80.11 us = 0.06% latency, 20.04 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.47 us = 0.05% latency, 25.7 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 59.37 us = 0.05% latency, 27.05 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.82 us = 0.03% latency, 11.25 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.86 us = 0.03% latency, 46.11 MFLOPS)\n",
      "            )\n",
      "            (4): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 565.53 us = 0.44% latency, 11.36 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.25 us = 0.06% latency, 19.52 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 79.39 us = 0.06% latency, 20.22 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 81.06 us = 0.06% latency, 19.81 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.32 us = 0.05% latency, 26.62 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.29 us = 0.03% latency, 11.12 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.29 us = 0.03% latency, 44.47 MFLOPS)\n",
      "            )\n",
      "            (5): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 544.55 us = 0.42% latency, 11.8 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.97 us = 0.06% latency, 19.35 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 80.35 us = 0.06% latency, 19.98 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.47 us = 0.05% latency, 25.7 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 59.37 us = 0.05% latency, 27.05 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.77 us = 0.03% latency, 10.99 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.15 us = 0.03% latency, 46.98 MFLOPS)\n",
      "            )\n",
      "          )\n",
      "        )\n",
      "        (input_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 152.11 us = 0.12% latency, 0 FLOPS)\n",
      "        (post_attention_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 142.1 us = 0.11% latency, 0 FLOPS)\n",
      "      )\n",
      "      (11): SteelDecoderLayer(\n",
      "        32.13 M = 2.86% Params, 3.54 GMACs = 2.65% MACs, 7.1 ms = 5.47% latency, 1 TFLOPS\n",
      "        (self_attn): SteelSdpaAttention(\n",
      "          12.85 M = 1.14% Params, 3.52 GMACs = 2.64% MACs, 1.46 ms = 1.12% latency, 4.83 TFLOPS\n",
      "          (q_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 305.18 us = 0.24% latency, 5.39 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (k_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 79.39 us = 0.06% latency, 20.71 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (v_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 272.75 us = 0.21% latency, 6.03 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (o_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 76.06 us = 0.06% latency, 21.62 TFLOPS, in_features=1792, out_features=1792, bias=False)\n",
      "          (rotary_emb): SteelRotaryEmbedding(0 = 0% Params, 0 MACs = 0% MACs, 38.86 us = 0.03% latency, 0 FLOPS)\n",
      "        )\n",
      "        (mlp): SteelSoftMoEV3(\n",
      "          19.28 M = 1.72% Params, 19.27 MMACs = 0.01% MACs, 4.95 ms = 3.81% latency, 11.13 GFLOPS\n",
      "          (experts): ModuleList(\n",
      "            (0): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 577.69 us = 0.44% latency, 11.12 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 92.74 us = 0.07% latency, 17.31 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.02 us = 0.06% latency, 19.58 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.66 us = 0.05% latency, 25.22 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.04 us = 0.05% latency, 26.31 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 43.63 us = 0.03% latency, 10.27 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.82 us = 0.03% latency, 45.01 MFLOPS)\n",
      "            )\n",
      "            (1): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 550.03 us = 0.42% latency, 11.68 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 84.64 us = 0.07% latency, 18.97 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 81.06 us = 0.06% latency, 19.81 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.47 us = 0.05% latency, 25.7 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 59.84 us = 0.05% latency, 26.83 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.72 us = 0.03% latency, 10.74 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.05 us = 0.03% latency, 44.74 MFLOPS)\n",
      "            )\n",
      "            (2): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 571.01 us = 0.44% latency, 11.25 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.21 us = 0.06% latency, 19.3 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 94.18 us = 0.07% latency, 17.05 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 64.13 us = 0.05% latency, 25.04 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.23 us = 0.05% latency, 25.8 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.58 us = 0.03% latency, 11.32 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 43.87 us = 0.03% latency, 40.85 MFLOPS)\n",
      "            )\n",
      "            (3): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 658.75 us = 0.51% latency, 9.75 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 87.5 us = 0.07% latency, 18.35 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 84.4 us = 0.07% latency, 19.02 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.23 us = 0.05% latency, 25.8 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.8 us = 0.05% latency, 26.41 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 114.92 us = 0.09% latency, 3.9 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.05 us = 0.03% latency, 44.74 MFLOPS)\n",
      "            )\n",
      "            (4): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 551.22 us = 0.42% latency, 11.66 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.45 us = 0.06% latency, 19.24 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 80.35 us = 0.06% latency, 19.98 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.7 us = 0.05% latency, 25.61 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.75 us = 0.05% latency, 26 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.77 us = 0.03% latency, 10.99 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.25 us = 0.03% latency, 43.45 MFLOPS)\n",
      "            )\n",
      "            (5): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 548.12 us = 0.42% latency, 11.72 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.45 us = 0.06% latency, 19.24 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 78.92 us = 0.06% latency, 20.35 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 66.28 us = 0.05% latency, 24.22 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.08 us = 0.05% latency, 26.72 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.34 us = 0.03% latency, 11.39 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.53 us = 0.03% latency, 44.21 MFLOPS)\n",
      "            )\n",
      "          )\n",
      "        )\n",
      "        (input_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 147.58 us = 0.11% latency, 0 FLOPS)\n",
      "        (post_attention_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 140.67 us = 0.11% latency, 0 FLOPS)\n",
      "      )\n",
      "      (12): SteelDecoderLayer(\n",
      "        32.13 M = 2.86% Params, 3.54 GMACs = 2.65% MACs, 6.77 ms = 5.21% latency, 1.05 TFLOPS\n",
      "        (self_attn): SteelSdpaAttention(\n",
      "          12.85 M = 1.14% Params, 3.52 GMACs = 2.64% MACs, 1.46 ms = 1.13% latency, 4.82 TFLOPS\n",
      "          (q_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 88.45 us = 0.07% latency, 18.59 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (k_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 291.35 us = 0.22% latency, 5.64 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (v_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 77.72 us = 0.06% latency, 21.15 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (o_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 76.77 us = 0.06% latency, 21.42 TFLOPS, in_features=1792, out_features=1792, bias=False)\n",
      "          (rotary_emb): SteelRotaryEmbedding(0 = 0% Params, 0 MACs = 0% MACs, 39.1 us = 0.03% latency, 0 FLOPS)\n",
      "        )\n",
      "        (mlp): SteelSoftMoEV3(\n",
      "          19.28 M = 1.72% Params, 19.27 MMACs = 0.01% MACs, 4.83 ms = 3.72% latency, 11.4 GFLOPS\n",
      "          (experts): ModuleList(\n",
      "            (0): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 567.91 us = 0.44% latency, 11.31 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 93.7 us = 0.07% latency, 17.14 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 80.82 us = 0.06% latency, 19.87 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.18 us = 0.05% latency, 25.41 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 59.84 us = 0.05% latency, 26.83 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 42.44 us = 0.03% latency, 10.56 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.34 us = 0.03% latency, 45.55 MFLOPS)\n",
      "            )\n",
      "            (1): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 560.28 us = 0.43% latency, 11.47 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 90.84 us = 0.07% latency, 17.68 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 81.78 us = 0.06% latency, 19.63 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.47 us = 0.05% latency, 25.7 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.08 us = 0.05% latency, 26.72 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.72 us = 0.03% latency, 10.74 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.34 us = 0.03% latency, 45.55 MFLOPS)\n",
      "            )\n",
      "            (2): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 554.8 us = 0.43% latency, 11.58 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.45 us = 0.06% latency, 19.24 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 80.35 us = 0.06% latency, 19.98 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 65.33 us = 0.05% latency, 24.58 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.56 us = 0.05% latency, 26.51 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.48 us = 0.03% latency, 10.8 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.82 us = 0.03% latency, 45.01 MFLOPS)\n",
      "            )\n",
      "            (3): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 547.65 us = 0.42% latency, 11.73 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.97 us = 0.06% latency, 19.35 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 80.35 us = 0.06% latency, 19.98 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.9 us = 0.05% latency, 25.13 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.08 us = 0.05% latency, 26.72 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.29 us = 0.03% latency, 11.12 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.86 us = 0.03% latency, 46.11 MFLOPS)\n",
      "            )\n",
      "            (4): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 558.38 us = 0.43% latency, 11.51 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 92.03 us = 0.07% latency, 17.45 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.25 us = 0.06% latency, 19.52 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.75 us = 0.05% latency, 26 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 59.6 us = 0.05% latency, 26.94 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.96 us = 0.03% latency, 10.68 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.15 us = 0.03% latency, 46.98 MFLOPS)\n",
      "            )\n",
      "            (5): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 543.36 us = 0.42% latency, 11.82 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.49 us = 0.06% latency, 19.46 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 80.35 us = 0.06% latency, 19.98 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.51 us = 0.05% latency, 26.1 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 59.6 us = 0.05% latency, 26.94 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.77 us = 0.03% latency, 10.99 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.34 us = 0.03% latency, 45.55 MFLOPS)\n",
      "            )\n",
      "          )\n",
      "        )\n",
      "        (input_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 153.78 us = 0.12% latency, 0 FLOPS)\n",
      "        (post_attention_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 144.48 us = 0.11% latency, 0 FLOPS)\n",
      "      )\n",
      "      (13): SteelDecoderLayer(\n",
      "        32.13 M = 2.86% Params, 3.54 GMACs = 2.65% MACs, 6.94 ms = 5.34% latency, 1.02 TFLOPS\n",
      "        (self_attn): SteelSdpaAttention(\n",
      "          12.85 M = 1.14% Params, 3.52 GMACs = 2.64% MACs, 1.45 ms = 1.11% latency, 4.88 TFLOPS\n",
      "          (q_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 290.63 us = 0.22% latency, 5.66 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (k_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 79.87 us = 0.06% latency, 20.59 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (v_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 283 us = 0.22% latency, 5.81 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (o_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 75.82 us = 0.06% latency, 21.69 TFLOPS, in_features=1792, out_features=1792, bias=False)\n",
      "          (rotary_emb): SteelRotaryEmbedding(0 = 0% Params, 0 MACs = 0% MACs, 40.29 us = 0.03% latency, 0 FLOPS)\n",
      "        )\n",
      "        (mlp): SteelSoftMoEV3(\n",
      "          19.28 M = 1.72% Params, 19.27 MMACs = 0.01% MACs, 4.81 ms = 3.71% latency, 11.44 GFLOPS\n",
      "          (experts): ModuleList(\n",
      "            (0): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 579.6 us = 0.45% latency, 11.08 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 93.22 us = 0.07% latency, 17.22 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.49 us = 0.06% latency, 19.46 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 64.37 us = 0.05% latency, 24.94 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.27 us = 0.05% latency, 26.2 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 42.68 us = 0.03% latency, 10.5 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.05 us = 0.03% latency, 44.74 MFLOPS)\n",
      "            )\n",
      "            (1): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 550.75 us = 0.42% latency, 11.67 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 84.64 us = 0.07% latency, 18.97 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 79.87 us = 0.06% latency, 20.1 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.94 us = 0.05% latency, 25.51 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.56 us = 0.05% latency, 26.51 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.77 us = 0.03% latency, 10.99 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.86 us = 0.03% latency, 46.11 MFLOPS)\n",
      "            )\n",
      "            (2): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 544.07 us = 0.42% latency, 11.81 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.21 us = 0.06% latency, 19.3 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 80.11 us = 0.06% latency, 20.04 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.47 us = 0.05% latency, 25.7 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 59.84 us = 0.05% latency, 26.83 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.29 us = 0.03% latency, 11.12 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.86 us = 0.03% latency, 46.11 MFLOPS)\n",
      "            )\n",
      "            (3): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 538.83 us = 0.41% latency, 11.92 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.25 us = 0.06% latency, 19.52 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 78.92 us = 0.06% latency, 20.35 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.51 us = 0.05% latency, 26.1 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 58.89 us = 0.05% latency, 27.27 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.34 us = 0.03% latency, 11.39 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.39 us = 0.03% latency, 46.68 MFLOPS)\n",
      "            )\n",
      "            (4): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 544.79 us = 0.42% latency, 11.79 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.21 us = 0.06% latency, 19.3 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 86.78 us = 0.07% latency, 18.5 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.27 us = 0.05% latency, 26.2 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 59.37 us = 0.05% latency, 27.05 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.1 us = 0.03% latency, 11.46 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.1 us = 0.03% latency, 45.83 MFLOPS)\n",
      "            )\n",
      "            (5): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 550.75 us = 0.42% latency, 11.67 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.49 us = 0.06% latency, 19.46 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 77.96 us = 0.06% latency, 20.59 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.27 us = 0.05% latency, 26.2 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.32 us = 0.05% latency, 26.62 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.05 us = 0.03% latency, 11.18 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.1 us = 0.03% latency, 45.83 MFLOPS)\n",
      "            )\n",
      "          )\n",
      "        )\n",
      "        (input_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 147.34 us = 0.11% latency, 0 FLOPS)\n",
      "        (post_attention_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 141.86 us = 0.11% latency, 0 FLOPS)\n",
      "      )\n",
      "      (14): SteelDecoderLayer(\n",
      "        32.13 M = 2.86% Params, 3.54 GMACs = 2.65% MACs, 6.74 ms = 5.19% latency, 1.05 TFLOPS\n",
      "        (self_attn): SteelSdpaAttention(\n",
      "          12.85 M = 1.14% Params, 3.52 GMACs = 2.64% MACs, 1.44 ms = 1.11% latency, 4.88 TFLOPS\n",
      "          (q_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 87.02 us = 0.07% latency, 18.89 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (k_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 290.39 us = 0.22% latency, 5.66 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (v_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 79.87 us = 0.06% latency, 20.59 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (o_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 76.29 us = 0.06% latency, 21.55 TFLOPS, in_features=1792, out_features=1792, bias=False)\n",
      "          (rotary_emb): SteelRotaryEmbedding(0 = 0% Params, 0 MACs = 0% MACs, 41.25 us = 0.03% latency, 0 FLOPS)\n",
      "        )\n",
      "        (mlp): SteelSoftMoEV3(\n",
      "          19.28 M = 1.72% Params, 19.27 MMACs = 0.01% MACs, 4.83 ms = 3.72% latency, 11.41 GFLOPS\n",
      "          (experts): ModuleList(\n",
      "            (0): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 577.69 us = 0.44% latency, 11.12 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 92.98 us = 0.07% latency, 17.27 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.49 us = 0.06% latency, 19.46 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.66 us = 0.05% latency, 25.22 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.04 us = 0.05% latency, 26.31 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 43.87 us = 0.03% latency, 10.21 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.58 us = 0.03% latency, 45.28 MFLOPS)\n",
      "            )\n",
      "            (1): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 567.67 us = 0.44% latency, 11.32 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 86.55 us = 0.07% latency, 18.55 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.25 us = 0.06% latency, 19.52 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.94 us = 0.05% latency, 25.51 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.08 us = 0.05% latency, 26.72 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 43.39 us = 0.03% latency, 10.32 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.05 us = 0.03% latency, 44.74 MFLOPS)\n",
      "            )\n",
      "            (2): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 559.09 us = 0.43% latency, 11.49 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 84.64 us = 0.07% latency, 18.97 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 79.63 us = 0.06% latency, 20.16 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 68.43 us = 0.05% latency, 23.47 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.04 us = 0.05% latency, 26.31 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.77 us = 0.03% latency, 10.99 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.53 us = 0.03% latency, 44.21 MFLOPS)\n",
      "            )\n",
      "            (3): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 553.85 us = 0.43% latency, 11.6 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.25 us = 0.06% latency, 19.52 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 81.3 us = 0.06% latency, 19.75 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.94 us = 0.05% latency, 25.51 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.75 us = 0.05% latency, 26 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.25 us = 0.03% latency, 10.86 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.34 us = 0.03% latency, 45.55 MFLOPS)\n",
      "            )\n",
      "            (4): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 546.22 us = 0.42% latency, 11.76 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.92 us = 0.06% latency, 19.13 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 80.59 us = 0.06% latency, 19.92 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.23 us = 0.05% latency, 25.8 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.56 us = 0.05% latency, 26.51 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.53 us = 0.03% latency, 11.05 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.34 us = 0.03% latency, 45.55 MFLOPS)\n",
      "            )\n",
      "            (5): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 537.16 us = 0.41% latency, 11.96 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 81.54 us = 0.06% latency, 19.69 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 80.59 us = 0.06% latency, 19.92 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.27 us = 0.05% latency, 26.2 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 59.6 us = 0.05% latency, 26.94 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.86 us = 0.03% latency, 11.53 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 37.91 us = 0.03% latency, 47.27 MFLOPS)\n",
      "            )\n",
      "          )\n",
      "        )\n",
      "        (input_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 154.5 us = 0.12% latency, 0 FLOPS)\n",
      "        (post_attention_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 145.44 us = 0.11% latency, 0 FLOPS)\n",
      "      )\n",
      "      (15): SteelDecoderLayer(\n",
      "        32.13 M = 2.86% Params, 3.54 GMACs = 2.65% MACs, 6.98 ms = 5.37% latency, 1.02 TFLOPS\n",
      "        (self_attn): SteelSdpaAttention(\n",
      "          12.85 M = 1.14% Params, 3.52 GMACs = 2.64% MACs, 1.47 ms = 1.13% latency, 4.79 TFLOPS\n",
      "          (q_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 305.41 us = 0.24% latency, 5.38 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (k_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 78.44 us = 0.06% latency, 20.96 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (v_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 293.25 us = 0.23% latency, 5.61 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (o_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 75.1 us = 0.06% latency, 21.89 TFLOPS, in_features=1792, out_features=1792, bias=False)\n",
      "          (rotary_emb): SteelRotaryEmbedding(0 = 0% Params, 0 MACs = 0% MACs, 41.25 us = 0.03% latency, 0 FLOPS)\n",
      "        )\n",
      "        (mlp): SteelSoftMoEV3(\n",
      "          19.28 M = 1.72% Params, 19.27 MMACs = 0.01% MACs, 4.82 ms = 3.71% latency, 11.43 GFLOPS\n",
      "          (experts): ModuleList(\n",
      "            (0): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 586.03 us = 0.45% latency, 10.96 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 92.74 us = 0.07% latency, 17.31 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 81.54 us = 0.06% latency, 19.69 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 64.13 us = 0.05% latency, 25.04 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 68.66 us = 0.05% latency, 23.38 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 43.63 us = 0.03% latency, 10.27 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.05 us = 0.03% latency, 44.74 MFLOPS)\n",
      "            )\n",
      "            (1): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 559.33 us = 0.43% latency, 11.49 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 85.59 us = 0.07% latency, 18.76 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.45 us = 0.06% latency, 19.24 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.42 us = 0.05% latency, 25.32 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.56 us = 0.05% latency, 26.51 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.72 us = 0.03% latency, 10.74 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.34 us = 0.03% latency, 45.55 MFLOPS)\n",
      "            )\n",
      "            (2): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 558.61 us = 0.43% latency, 11.5 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.25 us = 0.06% latency, 19.52 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 79.87 us = 0.06% latency, 20.1 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.99 us = 0.05% latency, 25.9 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 69.14 us = 0.05% latency, 23.22 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.72 us = 0.03% latency, 10.74 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.86 us = 0.03% latency, 46.11 MFLOPS)\n",
      "            )\n",
      "            (3): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 543.83 us = 0.42% latency, 11.81 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.45 us = 0.06% latency, 19.24 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 79.87 us = 0.06% latency, 20.1 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.99 us = 0.05% latency, 25.9 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 59.13 us = 0.05% latency, 27.16 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.01 us = 0.03% latency, 10.92 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.86 us = 0.03% latency, 46.11 MFLOPS)\n",
      "            )\n",
      "            (4): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 539.54 us = 0.42% latency, 11.91 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 81.3 us = 0.06% latency, 19.75 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 79.15 us = 0.06% latency, 20.28 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.99 us = 0.05% latency, 25.9 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 59.37 us = 0.05% latency, 27.05 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.77 us = 0.03% latency, 10.99 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.15 us = 0.03% latency, 46.98 MFLOPS)\n",
      "            )\n",
      "            (5): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 543.83 us = 0.42% latency, 11.81 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.73 us = 0.06% latency, 19.41 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 78.44 us = 0.06% latency, 20.47 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.04 us = 0.05% latency, 26.31 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 67.47 us = 0.05% latency, 23.8 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.58 us = 0.03% latency, 11.32 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 37.67 us = 0.03% latency, 47.57 MFLOPS)\n",
      "            )\n",
      "          )\n",
      "        )\n",
      "        (input_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 147.58 us = 0.11% latency, 0 FLOPS)\n",
      "        (post_attention_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 143.77 us = 0.11% latency, 0 FLOPS)\n",
      "      )\n",
      "      (16): SteelDecoderLayer(\n",
      "        32.13 M = 2.86% Params, 3.54 GMACs = 2.65% MACs, 6.8 ms = 5.24% latency, 1.04 TFLOPS\n",
      "        (self_attn): SteelSdpaAttention(\n",
      "          12.85 M = 1.14% Params, 3.52 GMACs = 2.64% MACs, 1.45 ms = 1.12% latency, 4.84 TFLOPS\n",
      "          (q_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 90.6 us = 0.07% latency, 18.15 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (k_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 291.11 us = 0.22% latency, 5.65 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (v_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 78.2 us = 0.06% latency, 21.02 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (o_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 77.01 us = 0.06% latency, 21.35 TFLOPS, in_features=1792, out_features=1792, bias=False)\n",
      "          (rotary_emb): SteelRotaryEmbedding(0 = 0% Params, 0 MACs = 0% MACs, 39.58 us = 0.03% latency, 0 FLOPS)\n",
      "        )\n",
      "        (mlp): SteelSoftMoEV3(\n",
      "          19.28 M = 1.72% Params, 19.27 MMACs = 0.01% MACs, 4.87 ms = 3.75% latency, 11.31 GFLOPS\n",
      "          (experts): ModuleList(\n",
      "            (0): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 584.13 us = 0.45% latency, 11 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 93.7 us = 0.07% latency, 17.14 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 81.54 us = 0.06% latency, 19.69 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 64.61 us = 0.05% latency, 24.85 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.94 us = 0.05% latency, 25.51 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 42.68 us = 0.03% latency, 10.5 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.77 us = 0.03% latency, 43.95 MFLOPS)\n",
      "            )\n",
      "            (1): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 554.8 us = 0.43% latency, 11.58 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 86.55 us = 0.07% latency, 18.55 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 81.3 us = 0.06% latency, 19.75 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.66 us = 0.05% latency, 25.22 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 58.41 us = 0.04% latency, 27.49 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.72 us = 0.03% latency, 10.74 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.58 us = 0.03% latency, 45.28 MFLOPS)\n",
      "            )\n",
      "            (2): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 553.61 us = 0.43% latency, 11.61 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.68 us = 0.06% latency, 19.19 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.25 us = 0.06% latency, 19.52 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.99 us = 0.05% latency, 25.9 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 59.84 us = 0.05% latency, 26.83 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.77 us = 0.03% latency, 10.99 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.34 us = 0.03% latency, 45.55 MFLOPS)\n",
      "            )\n",
      "            (3): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 545.98 us = 0.42% latency, 11.77 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 84.4 us = 0.07% latency, 19.02 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 79.15 us = 0.06% latency, 20.28 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.75 us = 0.05% latency, 26 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 59.84 us = 0.05% latency, 26.83 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.58 us = 0.03% latency, 11.32 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.34 us = 0.03% latency, 45.55 MFLOPS)\n",
      "            )\n",
      "            (4): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 568.63 us = 0.44% latency, 11.3 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.49 us = 0.06% latency, 19.46 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 81.06 us = 0.06% latency, 19.81 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.18 us = 0.05% latency, 25.41 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 58.65 us = 0.05% latency, 27.38 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 46.73 us = 0.04% latency, 9.59 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.86 us = 0.03% latency, 46.11 MFLOPS)\n",
      "            )\n",
      "            (5): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 546.22 us = 0.42% latency, 11.76 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.45 us = 0.06% latency, 19.24 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 80.82 us = 0.06% latency, 19.87 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 62.7 us = 0.05% latency, 25.61 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.08 us = 0.05% latency, 26.72 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.77 us = 0.03% latency, 10.99 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.05 us = 0.03% latency, 44.74 MFLOPS)\n",
      "            )\n",
      "          )\n",
      "        )\n",
      "        (input_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 153.3 us = 0.12% latency, 0 FLOPS)\n",
      "        (post_attention_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 143.05 us = 0.11% latency, 0 FLOPS)\n",
      "      )\n",
      "      (17): SteelDecoderLayer(\n",
      "        32.13 M = 2.86% Params, 3.54 GMACs = 2.65% MACs, 6.96 ms = 5.36% latency, 1.02 TFLOPS\n",
      "        (self_attn): SteelSdpaAttention(\n",
      "          12.85 M = 1.14% Params, 3.52 GMACs = 2.64% MACs, 1.46 ms = 1.13% latency, 4.81 TFLOPS\n",
      "          (q_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 298.5 us = 0.23% latency, 5.51 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (k_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 78.2 us = 0.06% latency, 21.02 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (v_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 295.16 us = 0.23% latency, 5.57 TFLOPS, in_features=1792, out_features=1792, bias=True)\n",
      "          (o_proj): Linear(3.21 M = 0.29% Params, 822.08 MMACs = 0.62% MACs, 75.34 us = 0.06% latency, 21.82 TFLOPS, in_features=1792, out_features=1792, bias=False)\n",
      "          (rotary_emb): SteelRotaryEmbedding(0 = 0% Params, 0 MACs = 0% MACs, 40.05 us = 0.03% latency, 0 FLOPS)\n",
      "        )\n",
      "        (mlp): SteelSoftMoEV3(\n",
      "          19.28 M = 1.72% Params, 19.27 MMACs = 0.01% MACs, 4.8 ms = 3.7% latency, 11.46 GFLOPS\n",
      "          (experts): ModuleList(\n",
      "            (0): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 572.92 us = 0.44% latency, 11.21 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 93.22 us = 0.07% latency, 17.22 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.45 us = 0.06% latency, 19.24 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.66 us = 0.05% latency, 25.22 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 59.13 us = 0.05% latency, 27.16 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 43.87 us = 0.03% latency, 10.21 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.1 us = 0.03% latency, 45.83 MFLOPS)\n",
      "            )\n",
      "            (1): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 547.89 us = 0.42% latency, 11.73 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.68 us = 0.06% latency, 19.19 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 79.63 us = 0.06% latency, 20.16 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.18 us = 0.05% latency, 25.41 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 59.6 us = 0.05% latency, 26.94 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.53 us = 0.03% latency, 11.05 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.58 us = 0.03% latency, 45.28 MFLOPS)\n",
      "            )\n",
      "            (2): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 558.61 us = 0.43% latency, 11.5 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.21 us = 0.06% latency, 19.3 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 84.88 us = 0.07% latency, 18.92 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 63.66 us = 0.05% latency, 25.22 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.27 us = 0.05% latency, 26.2 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.01 us = 0.03% latency, 10.92 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.82 us = 0.03% latency, 45.01 MFLOPS)\n",
      "            )\n",
      "            (3): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 545.26 us = 0.42% latency, 11.78 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 83.92 us = 0.06% latency, 19.13 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 79.39 us = 0.06% latency, 20.22 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.8 us = 0.05% latency, 26.41 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 59.6 us = 0.05% latency, 26.94 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.77 us = 0.03% latency, 10.99 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.29 us = 0.03% latency, 44.47 MFLOPS)\n",
      "            )\n",
      "            (4): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 541.21 us = 0.42% latency, 11.87 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 84.16 us = 0.06% latency, 19.08 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 78.92 us = 0.06% latency, 20.35 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 61.75 us = 0.05% latency, 26 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 59.13 us = 0.05% latency, 27.16 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.58 us = 0.03% latency, 11.32 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.62 us = 0.03% latency, 46.4 MFLOPS)\n",
      "            )\n",
      "            (5): SteelSENet(\n",
      "              3.21 M = 0.29% Params, 3.21 MMACs = 0% MACs, 537.63 us = 0.41% latency, 11.95 GFLOPS\n",
      "              (gate_up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 82.73 us = 0.06% latency, 19.41 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (up_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 78.2 us = 0.06% latency, 20.53 GFLOPS, in_features=1792, out_features=448, bias=False)\n",
      "              (gate_down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 60.32 us = 0.05% latency, 26.62 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (down_proj): Linear(802.82 K = 0.07% Params, 802.82 KMACs = 0% MACs, 59.13 us = 0.05% latency, 27.16 GFLOPS, in_features=448, out_features=1792, bias=False)\n",
      "              (act_fn1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.58 us = 0.03% latency, 11.32 MFLOPS)\n",
      "              (act_fn2): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 37.67 us = 0.03% latency, 47.57 MFLOPS)\n",
      "            )\n",
      "          )\n",
      "        )\n",
      "        (input_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 148.53 us = 0.11% latency, 0 FLOPS)\n",
      "        (post_attention_layernorm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 144.48 us = 0.11% latency, 0 FLOPS)\n",
      "      )\n",
      "    )\n",
      "    (norm): SteelRMSNorm(1.79 K = 0% Params, 0 MACs = 0% MACs, 151.4 us = 0.12% latency, 0 FLOPS)\n",
      "  )\n",
      "  (lm_head): Linear(272.27 M = 24.25% Params, 69.7 GMACs = 52.22% MACs, 703.81 us = 0.54% latency, 198.07 TFLOPS, in_features=1792, out_features=151936, bias=False)\n",
      ")\n",
      "------------------------------------------------------------------------------\n",
      "[2024-05-18 14:54:10,200] [INFO] [profiler.py:226:end_profile] Flops profiler finished\n",
      "FLOPs: 267.23 G\n",
      "MACs: 133.47 GMACs\n",
      "Params: 1.12 B\n",
      "tensor([[[ 1.0625, -0.3398,  0.4121,  ..., -1.6406,  0.1035,  0.3086],\n",
      "         [ 0.8750,  0.0303,  0.8242,  ..., -1.4219,  0.0598, -0.3262],\n",
      "         [ 0.9727,  0.1807,  0.7578,  ..., -1.2266,  0.2373, -0.5156],\n",
      "         ...,\n",
      "         [-0.1377, -0.5273,  1.3750,  ..., -0.0349,  0.1826, -0.3809],\n",
      "         [-0.1533, -0.5742,  1.3750,  ..., -0.0625,  0.1641, -0.4180],\n",
      "         [-0.1719, -0.5117,  1.3438,  ..., -0.0645,  0.1465, -0.3516]]],\n",
      "       device='cuda:0', grad_fn=<SliceBackward0>)\n"
     ]
    }
   ],
   "source": [
    "\n",
    "# Profile one forward pass with the model cast to bf16, then print a slice of the logits.\n",
    "# Error to keep in mind when choosing the dtype:\n",
    "#   RuntimeError: FlashAttention only support fp16 and bf16 data type\n",
    "seed = 666\n",
    "# Seed, in order: torch's generator, the current CUDA device, all CUDA devices (multi-GPU), NumPy.\n",
    "for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all, np.random.seed):\n",
    "    seed_fn(seed)\n",
    "fake_input = torch.randint(low=1, high=10000, size=(1, 256), dtype=torch.long, device=model.device)\n",
    "input_dict = dict(input_ids=fake_input, labels=fake_input.clone())\n",
    "# fp32 variant of the same experiment, kept disabled for reference:\n",
    "# model = model.to(torch.float32)\n",
    "# flops, macs, params = get_model_profile(model, kwargs=input_dict, print_profile=True, detailed=True)\n",
    "# print(\"FLOPs:\", flops)\n",
    "# print(\"MACs:\", macs)\n",
    "# print(\"Params:\", params)\n",
    "# output = model(fake_input)\n",
    "# print(output.logits[:,:,:10])\n",
    "# print(\"*\"*20)\n",
    "#===========================================================\n",
    "model = model.to(torch.bfloat16)\n",
    "flops, macs, params = get_model_profile(model, detailed=True, print_profile=True, kwargs=input_dict)\n",
    "for label, value in ((\"FLOPs:\", flops), (\"MACs:\", macs), (\"Params:\", params)):\n",
    "    print(label, value)\n",
    "output = model(fake_input)\n",
    "# Only the first 10 entries along the last axis are shown to keep the output short.\n",
    "print(output.logits[:, :, :10])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 测试 rms+rope一致性"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from flash_attn.layers.rotary import apply_rotary_emb_func as __apply_rotary_emb_func\n",
    "from flash_attn.ops.rms_norm import rms_norm as __rms_norm\n",
    "from torch import nn\n",
    "import torch\n",
    "class Qwen2RMSNorm(nn.Module):\n",
    "    \"\"\"RMS normalisation over the last axis with a learnable per-feature scale.\n",
    "\n",
    "    Qwen2RMSNorm is equivalent to T5LayerNorm.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, hidden_size, eps=1e-6):\n",
    "        \"\"\"Create the scale vector.\n",
    "\n",
    "        Args:\n",
    "            hidden_size: length of the last axis of the tensors passed to forward().\n",
    "            eps: constant added to the mean square before the reciprocal square root.\n",
    "        \"\"\"\n",
    "        super().__init__()\n",
    "        self.variance_epsilon = eps\n",
    "        self.weight = nn.Parameter(torch.ones(hidden_size))\n",
    "\n",
    "    def forward(self, hidden_states):\n",
    "        \"\"\"Normalise `hidden_states` along its last axis and apply the learned scale.\n",
    "\n",
    "        The statistics are computed in float32; the normalised tensor is cast back to the\n",
    "        input dtype before being multiplied by `self.weight`, so the dtype of the result\n",
    "        follows torch's promotion rules for (weight dtype, input dtype).\n",
    "        \"\"\"\n",
    "        original_dtype = hidden_states.dtype\n",
    "        upcast = hidden_states.to(torch.float32)\n",
    "        mean_square = upcast.pow(2).mean(-1, keepdim=True)\n",
    "        normalised = upcast * torch.rsqrt(mean_square + self.variance_epsilon)\n",
    "        return self.weight * normalised.to(original_dtype)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.float32\n",
      "tensor([[ 0.3965, -0.6992, -1.8359, -0.8242, -1.3672,  1.4688,  0.1836,  1.0703,\n",
      "         -1.0859, -1.9062, -0.2969, -0.3633, -0.9648, -0.0815,  0.1748, -0.3105]],\n",
      "       device='cuda:0', grad_fn=<MulBackward0>)\n",
      "tensor([[ 0.3965, -0.6992, -1.8359, -0.8242, -1.3672,  1.4688,  0.1836,  1.0703,\n",
      "         -1.0859, -1.9062, -0.2969, -0.3633, -0.9648, -0.0815,  0.1748, -0.3105]],\n",
      "       device='cuda:0', dtype=torch.bfloat16,\n",
      "       grad_fn=<DropoutAddLayerNormFnBackward>)\n"
     ]
    }
   ],
   "source": [
    "# Consistency check: the pure-PyTorch Qwen2RMSNorm vs. the fused flash-attn rms_norm kernel.\n",
    "eps = 1e-6\n",
    "dim = 16\n",
    "# Named `norm_input` rather than `input` so the builtin input() is not shadowed for the rest of the kernel session.\n",
    "norm_input = torch.randn([1, dim], device=\"cuda:0\", dtype=torch.bfloat16)\n",
    "torch_rmsnorm = Qwen2RMSNorm(dim, eps=eps).to(\"cuda:0\")\n",
    "print(torch_rmsnorm.weight.dtype)\n",
    "# Reference result from the PyTorch module.\n",
    "output1 = torch_rmsnorm(norm_input)\n",
    "# Result from the fused kernel, fed the same weight and epsilon.\n",
    "output2 = __rms_norm(norm_input, torch_rmsnorm.weight, torch_rmsnorm.variance_epsilon)\n",
    "print(output1)\n",
    "print(output2)\n",
    "# The two results may come back in different dtypes, so compare in output2's dtype;\n",
    "# assert_close then applies its default tolerances for that dtype and fails loudly on a mismatch.\n",
    "torch.testing.assert_close(output1.to(output2.dtype), output2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Smoke test for RecurrentGemma: read the config from the local ./recurrentgemma directory,\n",
    "# instantiate the model from that config (no checkpoint is loaded in this cell) and print its module tree.\n",
    "# NOTE(review): this rebinds the notebook-level names `config` and `model`.\n",
    "from transformers import AutoConfig\n",
    "from recurrentgemma.modeling_recurrent_gemma import RecurrentGemmaForCausalLM\n",
    "config = AutoConfig.from_pretrained(\"./recurrentgemma\",trust_remote_code=True)\n",
    "model = RecurrentGemmaForCausalLM(config)\n",
    "print(model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# load model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/root/anaconda3/envs/pytorch/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-09-09 22:47:38,288] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
      "\u001b[93m [WARNING] \u001b[0m async_io requires the dev libaio .so object and headers but these were not found.\n",
      "\u001b[93m [WARNING] \u001b[0m async_io: please install the libaio-dev package with apt\n",
      "\u001b[93m [WARNING] \u001b[0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.\n",
      "\u001b[93m [WARNING] \u001b[0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/root/anaconda3/envs/pytorch/bin/../lib/gcc/x86_64-conda-linux-gnu/11.2.0/../../../../x86_64-conda-linux-gnu/bin/ld: cannot find -laio: 没有那个文件或目录\n",
      "collect2: error: ld returned 1 exit status\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[93m [WARNING] \u001b[0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.3\n",
      "\u001b[93m [WARNING] \u001b[0m using untested triton version (2.3.0), only 1.0.0 is known to be compatible\n",
      "/home/gujiangang/data_struct/Steel-LLM/pretrain_modify_from_TinyLlama/model\n",
      "zhanshijin: surrport flash_attn_2\n",
      "zhanshijin: if flash attn surrport window:True\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "mlp_class: SteelSENet\n",
      "FFN: SteelSoftMoeV3\n",
      "zhanshijin: now use _attn_implementation is sdpa, you can choose from dict_keys(['eager', 'flash_attention_2', 'sdpa'])\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "zhanshijin: use torch rmsnorm\n"
     ]
    },
    {
     "ename": "IsADirectoryError",
     "evalue": "[Errno 21] Is a directory: '/data/model/llm/fintuned_model/70wchineseinfinity_200wchoice_selfcog_ruozhiba_cmmlu/checkpoint-27459'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mIsADirectoryError\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[1], line 20\u001b[0m\n\u001b[1;32m     18\u001b[0m model \u001b[38;5;241m=\u001b[39m SteelForCausalLM(config)\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcuda:0\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m     19\u001b[0m state \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmodel\u001b[39m\u001b[38;5;124m\"\u001b[39m: model}\n\u001b[0;32m---> 20\u001b[0m \u001b[43mfabric\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[43mckpt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstate\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/anaconda3/envs/pytorch/lib/python3.10/site-packages/lightning/fabric/fabric.py:775\u001b[0m, in \u001b[0;36mFabric.load\u001b[0;34m(self, path, state, strict)\u001b[0m\n\u001b[1;32m    758\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Load a checkpoint from a file and restore the state of objects (modules, optimizers, etc.)\u001b[39;00m\n\u001b[1;32m    759\u001b[0m \n\u001b[1;32m    760\u001b[0m \u001b[38;5;124;03mHow and which processes load gets determined by the `strategy`.\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    772\u001b[0m \n\u001b[1;32m    773\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m    774\u001b[0m unwrapped_state \u001b[38;5;241m=\u001b[39m _unwrap_objects(state)\n\u001b[0;32m--> 775\u001b[0m remainder \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_strategy\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_checkpoint\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43munwrapped_state\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstrict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstrict\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    776\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbarrier()\n\u001b[1;32m    777\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m state \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m    778\u001b[0m     \u001b[38;5;66;03m# We need to unwrap objects (see above) but this creates a new dictionary. In-place updates\u001b[39;00m\n\u001b[1;32m    779\u001b[0m     \u001b[38;5;66;03m# (for user metadata) wouldn't show up in the original dict, so we need to copy the data back.\u001b[39;00m\n",
      "File \u001b[0;32m~/anaconda3/envs/pytorch/lib/python3.10/site-packages/lightning/fabric/strategies/strategy.py:329\u001b[0m, in \u001b[0;36mStrategy.load_checkpoint\u001b[0;34m(self, path, state, strict)\u001b[0m\n\u001b[1;32m    310\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Load the contents from a checkpoint and restore the state of the given objects.\u001b[39;00m\n\u001b[1;32m    311\u001b[0m \n\u001b[1;32m    312\u001b[0m \u001b[38;5;124;03mArgs:\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    326\u001b[0m \n\u001b[1;32m    327\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m    328\u001b[0m torch\u001b[38;5;241m.\u001b[39mcuda\u001b[38;5;241m.\u001b[39mempty_cache()\n\u001b[0;32m--> 329\u001b[0m checkpoint \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcheckpoint_io\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_checkpoint\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    330\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m state:\n\u001b[1;32m    331\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m checkpoint\n",
      "File \u001b[0;32m~/anaconda3/envs/pytorch/lib/python3.10/site-packages/lightning/fabric/plugins/io/torch_io.py:83\u001b[0m, in \u001b[0;36mTorchCheckpointIO.load_checkpoint\u001b[0;34m(self, path, map_location)\u001b[0m\n\u001b[1;32m     80\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m fs\u001b[38;5;241m.\u001b[39mexists(path):\n\u001b[1;32m     81\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCheckpoint file not found: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 83\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mpl_load\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_location\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmap_location\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/anaconda3/envs/pytorch/lib/python3.10/site-packages/lightning/fabric/utilities/cloud_io.py:56\u001b[0m, in \u001b[0;36m_load\u001b[0;34m(path_or_url, map_location)\u001b[0m\n\u001b[1;32m     51\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mhub\u001b[38;5;241m.\u001b[39mload_state_dict_from_url(\n\u001b[1;32m     52\u001b[0m         \u001b[38;5;28mstr\u001b[39m(path_or_url),\n\u001b[1;32m     53\u001b[0m         map_location\u001b[38;5;241m=\u001b[39mmap_location,  \u001b[38;5;66;03m# type: ignore[arg-type]\u001b[39;00m\n\u001b[1;32m     54\u001b[0m     )\n\u001b[1;32m     55\u001b[0m fs \u001b[38;5;241m=\u001b[39m get_filesystem(path_or_url)\n\u001b[0;32m---> 56\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mfs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath_or_url\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrb\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[1;32m     57\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mload(f, map_location\u001b[38;5;241m=\u001b[39mmap_location)\n",
      "File \u001b[0;32m~/anaconda3/envs/pytorch/lib/python3.10/site-packages/fsspec/spec.py:1199\u001b[0m, in \u001b[0;36mAbstractFileSystem.open\u001b[0;34m(self, path, mode, block_size, cache_options, compression, **kwargs)\u001b[0m\n\u001b[1;32m   1197\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m   1198\u001b[0m     ac \u001b[38;5;241m=\u001b[39m kwargs\u001b[38;5;241m.\u001b[39mpop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mautocommit\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_intrans)\n\u001b[0;32m-> 1199\u001b[0m     f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_open\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1200\u001b[0m \u001b[43m        \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1201\u001b[0m \u001b[43m        \u001b[49m\u001b[43mmode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1202\u001b[0m \u001b[43m        \u001b[49m\u001b[43mblock_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mblock_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1203\u001b[0m \u001b[43m        \u001b[49m\u001b[43mautocommit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mac\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1204\u001b[0m \u001b[43m        \u001b[49m\u001b[43mcache_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1205\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1206\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1207\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m compression \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m 
\u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m   1208\u001b[0m         \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfsspec\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompression\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m compr\n",
      "File \u001b[0;32m~/anaconda3/envs/pytorch/lib/python3.10/site-packages/fsspec/implementations/local.py:183\u001b[0m, in \u001b[0;36mLocalFileSystem._open\u001b[0;34m(self, path, mode, block_size, **kwargs)\u001b[0m\n\u001b[1;32m    181\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mauto_mkdir \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mw\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m    182\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmakedirs(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_parent(path), exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m--> 183\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mLocalFileOpener\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/anaconda3/envs/pytorch/lib/python3.10/site-packages/fsspec/implementations/local.py:314\u001b[0m, in \u001b[0;36mLocalFileOpener.__init__\u001b[0;34m(self, path, mode, autocommit, fs, compression, **kwargs)\u001b[0m\n\u001b[1;32m    312\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompression \u001b[38;5;241m=\u001b[39m get_compression(path, compression)\n\u001b[1;32m    313\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mblocksize \u001b[38;5;241m=\u001b[39m io\u001b[38;5;241m.\u001b[39mDEFAULT_BUFFER_SIZE\n\u001b[0;32m--> 314\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_open\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/anaconda3/envs/pytorch/lib/python3.10/site-packages/fsspec/implementations/local.py:319\u001b[0m, in \u001b[0;36mLocalFileOpener._open\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    317\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf\u001b[38;5;241m.\u001b[39mclosed:\n\u001b[1;32m    318\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mautocommit \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mw\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmode:\n\u001b[0;32m--> 319\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    320\u001b[0m         \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompression:\n\u001b[1;32m    321\u001b[0m             compress \u001b[38;5;241m=\u001b[39m compr[\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompression]\n",
      "\u001b[0;31mIsADirectoryError\u001b[0m: [Errno 21] Is a directory: '/data/model/llm/fintuned_model/70wchineseinfinity_200wchoice_selfcog_ruozhiba_cmmlu/checkpoint-27459'"
     ]
    }
   ],
   "source": [
    "# Load a Lightning Fabric checkpoint into a freshly constructed SteelForCausalLM on CPU.\n",
    "import os\n",
    "import sys\n",
    "import torch\n",
    "import numpy  as np\n",
    "import lightning as L\n",
    "from deepspeed.profiling.flops_profiler import get_model_profile  # type: ignore\n",
    "# fabric = L.Fabric(devices=1, precision=\"bf16-mixed\")\n",
    "fabric = L.Fabric(accelerator=\"cpu\")\n",
    "current_dir = os.getcwd()\n",
    "print(current_dir)\n",
    "# Make the local modeling package importable regardless of how the kernel was started.\n",
    "sys.path.append(os.path.join(current_dir, \"steel_modify_from_qwen_1_5\"))\n",
    "from transformers import AutoConfig\n",
    "from steel_modify_from_qwen_1_5.modeling_steel import SteelForCausalLM\n",
    "config = AutoConfig.from_pretrained(\"./steel_modify_from_qwen_1_5\",trust_remote_code=True)\n",
    "# config.mlp_type = \"raw\"\n",
    "# config.FFN_type = \"softmoe_v3\"\n",
    "ckpt = \"/data/model/llm/lightning_model/step-1060000-iter-8480000-ckpt/state.pth\"\n",
    "# fabric.load() opens `ckpt` as a single file. Pointing it at a checkpoint *directory* otherwise\n",
    "# fails with a bare IsADirectoryError, and only after the model has been built. Check first.\n",
    "if os.path.isdir(ckpt):\n",
    "    raise IsADirectoryError(\n",
    "        f\"ckpt must be a checkpoint file (e.g. .../state.pth), but it is a directory: {ckpt}\"\n",
    "    )\n",
    "# A newly constructed nn.Module starts in training mode; switch to eval() because this\n",
    "# notebook only runs inference / export on the loaded weights.\n",
    "model = SteelForCausalLM(config).to(\"cpu\").eval()\n",
    "state = {\"model\": model}\n",
    "# Restores the weights into state[\"model\"] in place.\n",
    "fabric.load(ckpt, state)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
      "/root/anaconda3/envs/pytorch/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:515: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.01` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.\n",
      "  warnings.warn(\n",
      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
      "Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "151643\n",
      "system\n",
      "You are a helpful assistant.\n",
      "user\n",
      "重力加速度是\n",
      "assistant\n",
      "10m/s2, 5m/s2, 3m/s2, 2\n"
     ]
    }
   ],
   "source": [
    "# Greedy-decode a short chat completion with the model loaded above.\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"./tokenizer_from_qwen_moe_chat\")\n",
    "print(tokenizer.pad_token_id)\n",
    "string = '''重力加速度是'''\n",
    "messages = [\n",
    "    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
    "    {\"role\": \"user\", \"content\": string}\n",
    "]\n",
    "# Render the conversation as prompt text (tokenize=False) with the generation prompt appended.\n",
    "text = tokenizer.apply_chat_template(\n",
    "    messages,\n",
    "    tokenize=False,\n",
    "    add_generation_prompt=True\n",
    ")\n",
    "# string = \"python怎么写for循环\"\n",
    "model_inputs = tokenizer([text], return_tensors=\"pt\").to(model.device)\n",
    "generated_ids = model.generate(\n",
    "    model_inputs.input_ids,\n",
    "    # Pass the mask and pad id explicitly; without them generate() warns and has to guess.\n",
    "    attention_mask=model_inputs.attention_mask,\n",
    "    pad_token_id=tokenizer.pad_token_id,\n",
    "    max_new_tokens=20,\n",
    "    repetition_penalty=1.1,\n",
    "    # Decoding here is greedy: `temperature` is only used when do_sample=True, so it was being\n",
    "    # ignored (with a warning). State the mode explicitly instead of passing a temperature.\n",
    "    do_sample=False,\n",
    ")\n",
    "# To print only the completion, strip the prompt tokens first:\n",
    "# generated_ids = [\n",
    "#     output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)\n",
    "# ]\n",
    "response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]\n",
    "print(response)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "保存模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the current `model` to the target directory with save_pretrained().\n",
    "# NOTE(review): the destination is an absolute, machine-specific path — adjust before running elsewhere.\n",
    "model.save_pretrained(\"/data/model/llm/hf_model/steel-llm-step-1060000-ckpt\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "hf加载模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "mlp_class: SteelSENet\n",
      "FFN: SteelSoftMoeV3\n",
      "zhanshijin: now use _attn_implementation is sdpa, you can choose from dict_keys(['eager', 'flash_attention_2', 'sdpa'])\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "zhanshijin: surrport flash_attn_2\n",
      "zhanshijin: if flash attn surrport window:True\n",
      "zhanshijin: use torch rmsnorm\n"
     ]
    }
   ],
   "source": [
    "\n",
    "# Alternative checkpoint (fine-tuned run), kept for quick switching:\n",
    "# model_dir = \"/data/model/llm/fintuned_model/from_pretrain_70wchineseinfinity_200wchoice_selfcog_ruozhiba/checkpoint-27369\"\n",
    "model_dir = \"/data/model/llm/Steel-LLM/steel-llm-chat-v1\"\n",
    "import sys\n",
    "# Put the checkpoint directory on the import path before loading from it.\n",
    "sys.path.append(model_dir)\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)\n",
    "model = AutoModelForCausalLM.from_pretrained(model_dir, trust_remote_code=True, device_map=\"cuda:0\")\n",
    "# eval() returns the module itself, so rebinding keeps `model` pointing at the same object.\n",
    "model = model.eval()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
      "Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "151643\n",
      "system\n",
      "You are a helpful assistant.\n",
      "user\n",
      "你叫什么名字\n",
      "assistant\n",
      "我是一个人工智能助手，没有实际的姓名。我的主要功能是帮助用户解决问题和提供信息。如果您需要任何其他类型的信息或服务，请告诉我。\n"
     ]
    }
   ],
   "source": [
    "# Greedy-decode a chat completion with the checkpoint loaded from `model_dir`.\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_dir)\n",
    "print(tokenizer.pad_token_id)\n",
    "string = '''你叫什么名字'''\n",
    "# string = \"\"\"在OSI参考模型中，直接为会话层提供服务的是，请直接给出选项:\\nA. 应用层\\nB. 表示层\\nC. 传输层\\nD. 网络层\"\"\"\n",
    "#string = \"\"\"A new fuel cell for notebook PCs, more compact and powerful than competing technologies, could be on the market in early 2006 at a price of around US\\\\$90, its Japanese inventors said Tuesday.\\n\\nWhat best summarizes the content of the above article?\\npick from the following. (I). World. (II). Sports. (III). Business. (IV). Science/Tech.\"\"\"\n",
    "#string = \"\"\"你是谁\"\"\"\n",
    "messages = [\n",
    "    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
    "    {\"role\": \"user\", \"content\": string}\n",
    "]\n",
    "# Render the conversation as prompt text (tokenize=False) with the generation prompt appended.\n",
    "text = tokenizer.apply_chat_template(\n",
    "    messages,\n",
    "    tokenize=False,\n",
    "    add_generation_prompt=True\n",
    ")\n",
    "# string = \"python怎么写for循环\"\n",
    "model_inputs = tokenizer([text], return_tensors=\"pt\").to(model.device)\n",
    "generated_ids = model.generate(\n",
    "    model_inputs.input_ids,\n",
    "    # Pass the mask and pad id explicitly; without them generate() warns and has to guess.\n",
    "    attention_mask=model_inputs.attention_mask,\n",
    "    pad_token_id=tokenizer.pad_token_id,\n",
    "    max_new_tokens=200,\n",
    "    repetition_penalty=1.1,\n",
    "    # A near-zero temperature was being used to approximate deterministic output, but\n",
    "    # `temperature` only applies when sampling. Request greedy decoding explicitly instead.\n",
    "    do_sample=False,\n",
    ")\n",
    "# To print only the completion, strip the prompt tokens first:\n",
    "# generated_ids = [\n",
    "#     output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)\n",
    "# ]\n",
    "response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]\n",
    "print(response)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Upload the current `model` to the Hub repository named below.\n",
    "# NOTE(review): presumably requires an authenticated Hugging Face session with write access — confirm before running.\n",
    "model.push_to_hub(\"gqszhanshijin/Steel-LLM\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# softmoe v1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([1, 30, 10])\n"
     ]
    }
   ],
   "source": [
    "from torch import nn\n",
    "import torch.nn.functional as F\n",
    "import torch\n",
    "from transformers import AutoConfig\n",
    "# Load the Steel config from the local directory; the toy layer below reads hidden_size and n_experts from it.\n",
    "config = AutoConfig.from_pretrained(\"./steel_modify_from_qwen_1_5\",trust_remote_code=True)\n",
    "\n",
    "class SteelSoftMoeV1(nn.Module):\n",
    "    \"\"\"Densely gated mixture of experts: every expert sees every input and the expert\n",
    "    outputs are blended with softmax gate weights.\n",
    "\n",
    "    Args:\n",
    "        config: object exposing `hidden_size` (input width) and `n_experts`.\n",
    "        layer: currently unused; the commented-out line in __init__ shows the intended\n",
    "            use as a factory for the expert modules.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, config, layer=None):\n",
    "        super().__init__()\n",
    "        self.config = config\n",
    "        # self.experts = nn.ModuleList([layer(config) for _ in range(config.n_experts)])\n",
    "        # Placeholder experts for this experiment: plain linear maps to 10 features.\n",
    "        self.experts = nn.ModuleList([nn.Linear(config.hidden_size, 10) for _ in range(config.n_experts)])\n",
    "        self.gating = nn.Linear(config.hidden_size, config.n_experts)\n",
    "\n",
    "    def forward(self, x):\n",
    "        \"\"\"Blend the expert outputs for `x`.\n",
    "\n",
    "        Args:\n",
    "            x: tensor of shape (..., hidden_size); any number of leading axes is accepted.\n",
    "\n",
    "        Returns:\n",
    "            Tensor of shape (..., 10): the gate-weighted sum of the expert outputs.\n",
    "        \"\"\"\n",
    "        # Softmax is evaluated in float32, then cast back to the input dtype.\n",
    "        weights = self.gating(x)\n",
    "        weights = nn.functional.softmax(weights, dim=-1, dtype=torch.float32).to(x.dtype)\n",
    "        # Stack experts on the second-to-last axis -> (..., n_experts, 10). Indexing from the\n",
    "        # end (instead of the fixed axis 2) gives the same result for 3-D input and also\n",
    "        # supports inputs with fewer or more leading axes.\n",
    "        outputs = torch.stack([expert(x) for expert in self.experts], dim=-2)\n",
    "        # (..., n_experts) -> (..., n_experts, 1) so each weight scales its expert's output.\n",
    "        weights = weights.unsqueeze(-1)\n",
    "        return torch.sum(outputs * weights, dim=-2)\n",
    "    \n",
    "# Shape check on a random (1, 30, hidden_size) batch; the layer's experts project to 10 features.\n",
    "fake_input = torch.randn([1,30,config.hidden_size])\n",
    "layer = SteelSoftMoeV1(config)\n",
    "print(layer(fake_input).shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.float32\n"
     ]
    }
   ],
   "source": [
    "# Type-promotion check: subtract a float32 tensor from a bfloat16 tensor and print the dtype of the result.\n",
    "import torch\n",
    "a = torch.tensor([1.0,2.0], device=\"cuda:0\",dtype=torch.bfloat16)\n",
    "b = torch.tensor([1.0,2.0], device=\"cuda:0\",dtype=torch.float32)\n",
    "c = a- b\n",
    "print(c.dtype)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pytorch",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
